|
| 1 | +#!/usr/bin/env python3 |
| 2 | +import socket |
| 3 | +import threading |
| 4 | +import json |
| 5 | +import os |
| 6 | +import time |
| 7 | +import sqlite3 |
| 8 | +from llama_cpp import Llama |
| 9 | +from pathlib import Path |
| 10 | + |
# ─── CONFIG ────────────────────────────────────────────────────────────────────
HOST = '127.0.0.1'   # loopback only — the server is not exposed to the network
PORT = 5005          # TCP port clients connect to
MODEL_PATH = '/home/autosecure/FYP/llama.cpp/models/Llama-3-WhiteRabbitNeo-8B-v2.0.Q4_K_M.gguf'
#MODEL_PATH = '/home/autosecure/FYP/llama.cpp/models/Llama-3-WhiteRabbitNeo-8B-v2.0.Q4_K_S.gguf'
CACHE_DB = 'llama_cache.db'  # SQLite file used by DiskCache (relative to CWD)
THREADS = 4          # CPU threads given to llama.cpp
N_CTX = 256          # context window in tokens (small — NOTE(review): prompts longer than this will be truncated)
MAX_TOKENS = 512     # generation cap per request
TEMPERATURE = 0.0    # deterministic sampling, which also makes caching by prompt sound
REPEAT_PENALTY = 1.1
GPU_LAYERS = 10      # number of transformer layers offloaded to the GPU
# ────────────────────────────────────────────────────────────────────────────────
| 24 | + |
| 25 | +# Thread-safe SQLite connection handler |
# Thread-safe SQLite connection handler
class DiskCache:
    """Persistent prompt -> response cache backed by SQLite.

    sqlite3 connections must not be shared across threads, so each thread
    lazily opens its own connection through ``threading.local()``.
    """

    def __init__(self, db_path=None):
        """Create/open the cache database.

        db_path: optional path to the SQLite file; defaults to the
                 module-level CACHE_DB (backward compatible with the
                 previous zero-argument constructor).
        """
        self.db_path = CACHE_DB if db_path is None else str(db_path)
        self.connections = threading.local()
        # Ensure the file exists up front so permission problems surface early.
        Path(self.db_path).touch()

    def get_conn(self):
        """Return this thread's connection, creating it (and the schema) on first use."""
        if not hasattr(self.connections, 'db'):
            self.connections.db = sqlite3.connect(self.db_path, check_same_thread=False)
            self.connections.db.execute('''CREATE TABLE IF NOT EXISTS cache
                        (prompt TEXT PRIMARY KEY, response TEXT)''')
        return self.connections.db

    def get(self, prompt):
        """Return the ``(response,)`` row for *prompt*, or None on a miss."""
        cursor = self.get_conn().execute(
            "SELECT response FROM cache WHERE prompt=?", (prompt,))
        return cursor.fetchone()

    def set(self, prompt, response):
        """Insert or overwrite the cached *response* for *prompt* and commit."""
        conn = self.get_conn()
        conn.execute("INSERT OR REPLACE INTO cache VALUES (?, ?)",
                     (prompt, response))
        conn.commit()
| 49 | + |
# Initialize the shared cache (all client threads use this one instance;
# DiskCache itself keeps per-thread connections).
cache = DiskCache()

# Fail fast with a clear message if the model file is missing.
if not os.path.isfile(MODEL_PATH):
    print(f"[!] Model not found at {MODEL_PATH}")
    # exit() is a site-module convenience meant for interactive sessions;
    # raising SystemExit works unconditionally and sets the exit code.
    raise SystemExit(1)
| 56 | + |
def load_model():
    """Instantiate the GGUF model and run a one-token warm-up generation."""
    print(f"[+] Loading model with {GPU_LAYERS} GPU layers...")
    model = Llama(model_path=MODEL_PATH,
                  n_threads=THREADS,
                  n_ctx=N_CTX,
                  n_gpu_layers=GPU_LAYERS,
                  use_mmap=True,
                  use_mlock=False,
                  verbose=True)

    # The first inference pays one-time setup costs; do it now so the
    # first real client request is fast.
    print("[+] Warming up model...")
    model("Warmup", max_tokens=1, temperature=0)
    return model
| 72 | + |
| 73 | +llm = load_model() |
| 74 | +print("[+] Model ready!") |
| 75 | + |
def _recv_json(conn):
    """Read from *conn* until the accumulated bytes parse as one JSON value.

    A single recv() is not guaranteed to return the whole request, so we
    keep reading until the buffer decodes and parses, or the peer closes
    the connection with an incomplete message (-> ValueError).
    """
    buf = b''
    while True:
        chunk = conn.recv(4096)
        if chunk:
            buf += chunk
        try:
            return json.loads(buf.decode('utf-8'))
        except (ValueError, UnicodeDecodeError):
            if not chunk:  # peer closed before sending a complete request
                raise ValueError("incomplete JSON request")


def handle_client(conn, addr):
    """Serve one client connection.

    Protocol: the client sends a JSON object {"prompt": "..."}; the server
    streams the generated text back as raw UTF-8 (no framing), or sends
    "ERROR: ..." on failure. The connection is always closed afterwards.
    """
    print(f"[>] Connection from {addr}")
    try:
        req = _recv_json(conn)
        prompt = req['prompt'].strip()

        print(f"[>] Processing: {prompt[:60]}...")
        start = time.time()

        # Check cache (sound because TEMPERATURE=0 makes output deterministic).
        if cached := cache.get(prompt):
            print("[+] Cache hit")
            conn.sendall(cached[0].encode('utf-8'))
            return

        # Stream new response token-by-token.
        response = []
        stream = llm(
            prompt=prompt,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            repeat_penalty=REPEAT_PENALTY,
            stream=True
        )

        for chunk in stream:
            token = chunk['choices'][0]['text']
            # Forward every non-empty token. Filtering on token.strip()
            # would drop whitespace-only tokens (spaces, newlines) and
            # corrupt both the streamed and the cached text.
            if token:
                response.append(token)
                conn.sendall(token.encode('utf-8'))

        # Cache the complete response for future identical prompts.
        cache.set(prompt, ''.join(response))
        print(f"[+] Generated in {time.time()-start:.2f}s")

    except Exception as e:
        print(f"[!] Error: {e}")
        # The socket itself may be the thing that failed; don't let the
        # error report raise a second exception.
        try:
            conn.sendall(f"ERROR: {e}".encode('utf-8'))
        except OSError:
            pass
    finally:
        conn.close()
| 117 | + |
def main():
    """Bind the TCP endpoint and hand each accepted connection to a daemon thread."""
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # Allow immediate rebinding after a restart (skip TIME_WAIT).
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    with server:
        server.bind((HOST, PORT))
        server.listen(5)
        print(f"[+] Server ready on {HOST}:{PORT}")

        try:
            while True:
                client, peer = server.accept()
                worker = threading.Thread(target=handle_client,
                                          args=(client, peer),
                                          daemon=True)
                worker.start()
        except KeyboardInterrupt:
            print("\n[+] Shutting down...")


if __name__ == '__main__':
    main()
0 commit comments