Commit 6f3ac9a

ai-server-file
1 parent 0bd4d37 commit 6f3ac9a

File tree

1 file changed: +137 -0


ai_server.py

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
#!/usr/bin/env python3
import socket
import sys
import threading
import json
import os
import time
import sqlite3
from llama_cpp import Llama
from pathlib import Path

# ─── CONFIG ────────────────────────────────────────────────────────────────────
HOST = '127.0.0.1'
PORT = 5005
MODEL_PATH = '/home/autosecure/FYP/llama.cpp/models/Llama-3-WhiteRabbitNeo-8B-v2.0.Q4_K_M.gguf'
#MODEL_PATH = '/home/autosecure/FYP/llama.cpp/models/Llama-3-WhiteRabbitNeo-8B-v2.0.Q4_K_S.gguf'
CACHE_DB = 'llama_cache.db'
THREADS = 4
N_CTX = 256
MAX_TOKENS = 512  # generation is additionally capped by the N_CTX context window
TEMPERATURE = 0.0
REPEAT_PENALTY = 1.1
GPU_LAYERS = 10
# ────────────────────────────────────────────────────────────────────────────────

# Thread-safe SQLite cache: each handler thread gets its own connection
# via threading.local(), so no connection is ever shared across threads.
class DiskCache:
    def __init__(self):
        self.connections = threading.local()
        Path(CACHE_DB).touch()

    def get_conn(self):
        if not hasattr(self.connections, 'db'):
            self.connections.db = sqlite3.connect(CACHE_DB, check_same_thread=False)
            self.connections.db.execute('''CREATE TABLE IF NOT EXISTS cache
                                           (prompt TEXT PRIMARY KEY, response TEXT)''')
        return self.connections.db

    def get(self, prompt):
        conn = self.get_conn()
        cursor = conn.cursor()
        cursor.execute("SELECT response FROM cache WHERE prompt=?", (prompt,))
        return cursor.fetchone()

    def set(self, prompt, response):
        conn = self.get_conn()
        conn.execute("INSERT OR REPLACE INTO cache VALUES (?, ?)",
                     (prompt, response))
        conn.commit()

# Initialize cache
cache = DiskCache()

if not os.path.isfile(MODEL_PATH):
    print(f"[!] Model not found at {MODEL_PATH}")
    sys.exit(1)

def load_model():
    print(f"[+] Loading model with {GPU_LAYERS} GPU layers...")
    llm = Llama(
        model_path=MODEL_PATH,
        n_threads=THREADS,
        n_ctx=N_CTX,
        n_gpu_layers=GPU_LAYERS,
        use_mmap=True,
        use_mlock=False,
        verbose=True
    )

    # A one-token generation forces the weights into memory before the first real request
    print("[+] Warming up model...")
    llm("Warmup", max_tokens=1, temperature=0)
    return llm

llm = load_model()
print("[+] Model ready!")

def handle_client(conn, addr):
    print(f"[>] Connection from {addr}")
    try:
        # Single recv: assumes the entire JSON request fits in 4096 bytes
        raw = conn.recv(4096).decode('utf-8')
        req = json.loads(raw)
        prompt = req['prompt'].strip()

        print(f"[>] Processing: {prompt[:60]}...")
        start = time.time()

        # Check cache
        if cached := cache.get(prompt):
            print("[+] Cache hit")
            conn.sendall(cached[0].encode('utf-8'))
            return

        # Stream new response token by token
        response = []
        stream = llm(
            prompt=prompt,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            repeat_penalty=REPEAT_PENALTY,
            stream=True
        )

        for chunk in stream:
            token = chunk['choices'][0]['text']
            if token.strip():  # whitespace-only tokens are dropped from both the stream and the cache
                response.append(token)
                conn.sendall(token.encode('utf-8'))

        # Cache complete response
        cache.set(prompt, ''.join(response))
        print(f"[+] Generated in {time.time()-start:.2f}s")

    except Exception as e:
        print(f"[!] Error: {e}")
        conn.sendall(f"ERROR: {e}".encode('utf-8'))
    finally:
        conn.close()

def main():
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind((HOST, PORT))
        s.listen(5)
        print(f"[+] Server ready on {HOST}:{PORT}")

        try:
            while True:
                conn, addr = s.accept()
                # One daemon thread per client; threads die with the process
                threading.Thread(
                    target=handle_client,
                    args=(conn, addr),
                    daemon=True
                ).start()
        except KeyboardInterrupt:
            print("\n[+] Shutting down...")

if __name__ == '__main__':
    main()
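For reference, a minimal client sketch (not part of this commit) under the wire protocol the handler implements: one JSON object with a `prompt` key goes in, raw UTF-8 tokens stream back until the server closes the connection. Run the server with `python3 ai_server.py`, then run this against it; HOST and PORT must match the server's CONFIG block.

#!/usr/bin/env python3
# Hypothetical client for ai_server.py -- an illustration, not part of the commit.
import codecs
import json
import socket

HOST, PORT = '127.0.0.1', 5005  # must match the server's CONFIG block

def query(prompt: str) -> str:
    # recv() can split a multi-byte UTF-8 character across chunks,
    # so decode incrementally rather than per-chunk.
    decoder = codecs.getincrementaldecoder('utf-8')()
    parts = []
    with socket.create_connection((HOST, PORT)) as s:
        s.sendall(json.dumps({'prompt': prompt}).encode('utf-8'))
        while True:
            data = s.recv(4096)
            if not data:  # server closed the connection: generation finished
                break
            text = decoder.decode(data)
            parts.append(text)
            print(text, end='', flush=True)
    return ''.join(parts)

if __name__ == '__main__':
    query('What does this server do?')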

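Because the cache is a plain SQLite file, cached prompt/response pairs can also be inspected offline. A quick sketch, assuming llama_cache.db sits in the current working directory:

# Inspect the server's SQLite response cache (illustrative, not part of the commit).
import sqlite3
from contextlib import closing

with closing(sqlite3.connect('llama_cache.db')) as conn:
    for prompt, response in conn.execute("SELECT prompt, response FROM cache"):
        print(f"{prompt[:50]!r} -> {len(response)} chars")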