def extract_keywords(result, max_keywords=5):
    """Extract a cleaned, comma-separated keyword string from an LLM API response.

    Three response layouts are recognised, checked in this order:

    * chat style: ``result["choices"][0]["message"]["content"]``
    * ``result["response"]``
    * ``result["text"]``

    A non-empty ``choices`` list always wins; ``response`` / ``text`` are only
    consulted when ``choices`` is missing or empty.

    Args:
        result: Decoded JSON body of the LLM response. Anything that is not a
            dict (``None``, a list, a bare string) yields ``None``.
        max_keywords: Maximum number of keywords to keep (default 5).

    Returns:
        The keywords joined with ``", "``, or ``None`` when the response has
        no recognised layout, carries no text, or contains no usable keyword.
    """
    if not isinstance(result, dict):
        return None

    raw = None
    choices = result.get("choices")
    if isinstance(choices, (list, tuple)) and choices:
        first = choices[0]
        message = first.get("message") if isinstance(first, dict) else None
        # "content" may be present but null; treat that like a missing value
        # instead of calling .strip() on None.
        raw = message.get("content") if isinstance(message, dict) else None
    elif "response" in result:
        raw = result["response"]
    elif "text" in result:
        raw = result["text"]

    if not isinstance(raw, str) or max_keywords <= 0:
        return None

    keywords = []
    seen = set()
    # Models sometimes answer with one keyword per line instead of commas.
    # Treating line breaks as separators also guarantees the result is a
    # single line, which the ":keywords:" field of an RST meta block needs.
    for line in raw.splitlines():
        for piece in line.split(","):
            keyword = " ".join(piece.split())  # trim and collapse inner whitespace
            folded = keyword.casefold()
            if not keyword or folded in seen:
                continue
            seen.add(folded)
            keywords.append(keyword)
            if len(keywords) == max_keywords:
                return ", ".join(keywords)

    return ", ".join(keywords) or None
def generate_keywords_with_llm(text, service_title, llm_api_url, model_name, api_username, api_password,
                               *, max_attempts=3, timeout=15):
    """Ask the LLM API for up to five keywords describing a document.

    A chat-style payload (a ``messages`` list holding one user prompt) is
    POSTed to ``llm_api_url``. The request is repeated until
    ``extract_keywords`` returns a non-empty value or ``max_attempts`` is
    exhausted; after that a title derived from ``service_title`` is returned,
    so the caller always receives a string.

    Args:
        text: Document content; only the first 2000 characters are sent, with
            newlines replaced by spaces.
        service_title: Service name used in the prompt and for the fallback.
        llm_api_url: URL the JSON payload is POSTed to.
        model_name: Value sent as ``"model"`` in the payload.
        api_username: Username for HTTP Basic auth.
        api_password: Password for HTTP Basic auth. The ``Authorization``
            header is only added when both username and password are truthy.
        max_attempts: Number of requests to try before falling back.
        timeout: Per-request timeout in seconds passed to ``requests.post``.

    Returns:
        The comma-separated keyword string produced by ``extract_keywords``,
        or ``service_title`` with hyphens replaced by spaces and title-cased
        when every attempt failed.
    """
    content_preview = text[:2000].replace("\n", " ")
    prompt = (
        f"Generate up to 5 keywords (comma-separated) for: {service_title}. "
        f"Content preview: {content_preview}. "
        f"Output ONLY comma-separated keywords, nothing else."
    )

    headers = {"Content-Type": "application/json"}
    if api_username and api_password:
        credentials = f"{api_username}:{api_password}"
        encoded_credentials = base64.b64encode(credentials.encode()).decode()
        headers["Authorization"] = f"Basic {encoded_credentials}"

    # The payload does not change between attempts, so build it once.
    payload = {
        "messages": [
            {"role": "user", "content": prompt},
        ],
        "model": model_name,
        "temperature": 0.7,

        "top_k": 40,
        "top_p": 0.9,
        "min_p": 0.05,

        "repeat_last_n": 256,
        "repeat_penalty": 1.18,
        "presence_penalty": 0.2,
        "frequency_penalty": 0.2,

        "dry_multiplier": 0.8,
        "dry_base": 1.75,
        "dry_allowed_length": 2,
        "dry_penalty_last_n": -1,
        # NOTE(review): presumably keeps reasoning text out of the reply so
        # only the keywords come back - confirm against the serving backend.
        "chat_template_kwargs": {"enable_thinking": False},
    }

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.post(
                llm_api_url,
                json=payload,
                headers=headers,
                timeout=timeout,
            )
            response.raise_for_status()
            keywords = extract_keywords(response.json())
        except requests.exceptions.RequestException as e:
            logging.warning("Attempt %d of %d: LLM API request failed: %s", attempt, max_attempts, e)
        except (KeyError, ValueError, IndexError, AttributeError, TypeError) as e:
            # AttributeError/TypeError cover unexpectedly shaped JSON (null
            # fields, non-dict entries); without them a single odd response
            # would abort the whole run instead of falling back.
            logging.warning("Attempt %d of %d: LLM API response parsing failed: %s", attempt, max_attempts, e)
        else:
            if keywords:
                return keywords
            logging.warning(
                "Attempt %d of %d: Empty or invalid response from LLM API for keywords.",
                attempt,
                max_attempts,
            )

    logging.warning("All LLM API retries failed for keywords. Using fallback.")
    return service_title.replace("-", " ").title()
meta::\n" + if meta_description: + meta_block += " :description: {}\n".format(meta_description) + if meta_keywords: + meta_block += " :keywords: {}\n".format(meta_keywords) # Check if meta block already exists if ".. meta::" in content: @@ -237,7 +319,6 @@ def process_service(args, service): new_branch.checkout() - # Find all RST files in the documentation (doc/, umn/, api-ref/) rst_files = list(repo_dir.rglob("doc/**/*.rst")) + \ list(repo_dir.rglob("umn/**/*.rst")) + \ list(repo_dir.rglob("api-ref/**/*.rst")) @@ -259,10 +340,18 @@ def process_service(args, service): args.llm_username, args.llm_password ) + keywords = generate_keywords_with_llm( + content, + service["service_title"], + args.llm_api_url, + args.llm_model, + args.llm_username, + args.llm_password + ) - if add_sphinx_metadata(rst_file, description): + if add_sphinx_metadata(rst_file, description, keywords): updated_count += 1 - logging.info(f"Added meta description to {rst_file}") + logging.info(f"Added meta description and keywords to {rst_file}") else: processed_count += 1