From 389fee25988213159370b7a4e10f00b136e50751 Mon Sep 17 00:00:00 2001
From: Sebastian Gode <sebastian.gode@telekom.de>
Date: Tue, 3 Mar 2026 09:37:49 +0000
Subject: [PATCH] Script for LLM

---
 tools/generate_meta.py | 373 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 373 insertions(+)
 create mode 100755 tools/generate_meta.py
diff --git a/tools/generate_meta.py b/tools/generate_meta.py
new file mode 100755
index 0000000..20c10e9
--- /dev/null
+++ b/tools/generate_meta.py
@@ -0,0 +1,373 @@
+#!/usr/bin/python
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import base64
+import logging
+import pathlib
+import re
+import requests
+import subprocess
+import sys
+
+from git import exc
+from git import Repo
+
+import otc_metadata.services
+
+data = otc_metadata.services.Services()  # queried for the service list in main()
+
+api_session = requests.Session()  # module-level HTTP session, created at import time
+
+
+def remove_thinking_content(text):
+    """Strip every <think>...</think> span (any case, may span lines) and trim the result."""
+    # The inline (?is) prefix already enables IGNORECASE and DOTALL for this pattern.
+    without_reasoning = re.sub(r'(?is)<think>.*?</think>', '', text)
+    return without_reasoning.strip()
+
+
+def _extract_llm_text(result):
+    """Return the raw completion text from a decoded LLM response, or None if absent."""
+    if not isinstance(result, dict):
+        return None
+    if result.get("choices"):
+        # completions-style payload: {"choices": [{"text": ...}]}
+        return result["choices"][0]["text"]
+    # other accepted payload shapes: {"response": ...} or {"text": ...}
+    for key in ("response", "text"):
+        if key in result:
+            return result[key]
+    return None
+
+
+def _to_single_sentence(text, limit=160):
+    """Collapse whitespace and cut *text* to its first sentence, at most *limit* characters."""
+    # The description is written on a single RST field line, so fold newlines into spaces.
+    flat = " ".join(text.split())
+    # Split only after end punctuation followed by whitespace so "v1.0" stays intact.
+    sentence = re.split(r"(?<=[.!?])\s", flat, maxsplit=1)[0]
+    if not sentence.rstrip(".!?"):
+        # Nothing but punctuation before the first break: keep the leading text instead.
+        sentence = flat[:limit].strip()
+    if not sentence.endswith((".", "!", "?")):
+        sentence += "."
+    if len(sentence) > limit:
+        sentence = sentence[: limit - 3] + "..."
+    return sentence
+
+
+def generate_description_with_llm(text, service_title, llm_api_url, model_name, api_username, api_password):
+    """Generate a one-sentence meta description (at most 160 characters) using the LLM API.
+
+    The first 2000 characters of *text* are sent to *llm_api_url*; HTTP Basic
+    authentication is added only when both credentials are given. An empty answer
+    yields "<service_title> documentation"; a failed request or unparsable response
+    falls back to the first prose sentence of *text*, then to that same default.
+    """
+    prompt = (
+        "/no_think\n"
+        f"Generate a concise HTML meta description (maximum 160 characters, minimum 40 characters) "
+        f"for the following documentation content of the service '{service_title}'. "
+        f"The description should be suitable for search engines and summarize the content. "
+        f"Do not include any markdown formatting, quotes, or meta-commentary.\n\n"
+        f"Content:\n{text[:2000]}\n\n"
+        f"Meta description:"
+    )
+
+    try:
+        headers = {"Content-Type": "application/json"}
+        if api_username and api_password:
+            credentials = f"{api_username}:{api_password}"
+            encoded_credentials = base64.b64encode(credentials.encode()).decode()
+            headers["Authorization"] = f"Basic {encoded_credentials}"
+
+        # Reuse the module-level session so consecutive files share one connection pool.
+        response = api_session.post(
+            llm_api_url,
+            json={
+                "prompt": prompt,
+                "model": model_name,
+                "temperature": 0.2,
+                "repeat_last_n": 128,
+                "repeat_penalty": 1.15,
+                "presence_penalty": 0.2,
+                "frequency_penalty": 0.2,
+                # optional DRY anti-looping (try only if it still loops)
+                "dry_multiplier": 0.5,
+                "dry_base": 1.75,
+                "dry_allowed_length": 2,
+            },
+            headers=headers,
+            timeout=15,
+        )
+        response.raise_for_status()
+        raw = _extract_llm_text(response.json())
+        if raw is not None:
+            description = remove_thinking_content(raw.strip())
+            if not description:
+                return f"{service_title} documentation"
+            return _to_single_sentence(description)
+    except requests.exceptions.RequestException as e:
+        logging.warning(f"LLM API request failed: {e}. Using fallback description.")
+    except (KeyError, ValueError, IndexError, TypeError, AttributeError) as e:
+        logging.warning(f"LLM API response parsing failed: {e}. Using fallback description.")
+
+    # Fallback: first sentence of the first content line that is prose rather than markup
+    for line in text.split("\n"):
+        line = line.strip()
+        if not any(ch.isalnum() for ch in line) or line.startswith(("-", "#", "..", ":")):
+            continue  # blank, heading adornment, list item, directive/comment or field line
+        return _to_single_sentence(line)
+
+    return f"{service_title} documentation"
+
+
+def read_rst_content(file_path):
+    """Return the complete text of the RST file at *file_path*, decoded as UTF-8."""
+    with open(file_path, encoding="utf-8") as rst_handle:
+        return rst_handle.read()
+
+
+def add_sphinx_metadata(file_path, meta_description):
+    """Append a Sphinx ``meta`` directive with *meta_description* to an RST file.
+
+    Returns False and leaves the file untouched when it already contains a
+    ``.. meta::`` directive, True after the block has been written.
+    """
+    content = read_rst_content(file_path)
+
+    # Check if meta block already exists
+    if ".. meta::" in content:
+        logging.debug(f"Meta block already exists in {file_path}. Skipping.")
+        return False
+
+    # The field value must stay on one line, so fold any newlines into spaces.
+    description = " ".join(str(meta_description).split())
+    # A blank line is required before the directive; without it reStructuredText
+    # treats ".. meta::" as a continuation of the preceding paragraph.
+    new_content = f"{content.rstrip()}\n\n.. meta::\n   :description: {description}\n"
+
+    with open(file_path, "w", encoding="utf-8", newline="") as f:
+        f.write(new_content)
+    return True
+
+
+def process_service(args, service):
+    """Add LLM-generated meta descriptions to the RST files of one service repository.
+
+    Picks the repository of *service* matching ``args.cloud_environment`` and
+    ``args.target_environment``, refreshes or clones it below ``args.work_dir``,
+    creates the branch ``add-meta-<args.branch_name>`` from ``main``, appends a meta
+    block to every RST file under doc/, umn/ or api-ref/ that lacks one, then commits,
+    pushes and, for GitHub repositories, opens a pull request with the ``gh`` CLI.
+    Errors are logged; the function returns None.
+    """
+    logging.debug(f"Processing service {service['service_title']}")
+    workdir = pathlib.Path(args.work_dir)
+    workdir.mkdir(parents=True, exist_ok=True)
+
+    git_repo = None
+    error_list = []
+
+    # Select the repository matching BOTH environments; a cloud-only match must not count.
+    repo = None
+    for r in service["repositories"]:
+        if r["cloud_environments"][0] != args.cloud_environment:
+            continue
+        if r["environment"] == args.target_environment:
+            repo = r
+            break
+        logging.debug(f"Skipping repository {r}")
+
+    if repo is None:
+        logging.info(f"No repository found for service {service['service_title']}")
+        return
+
+    repo_dir = workdir / repo["type"] / repo["repo"]
+
+    if repo_dir.exists():
+        logging.debug(f"Repository {repo_dir} already exists")
+        try:
+            git_repo = Repo(repo_dir)
+            git_repo.remotes.origin.fetch()
+            git_repo.heads.main.checkout()
+            git_repo.remotes.origin.pull()
+        except exc.InvalidGitRepositoryError:
+            logging.error("Existing repository checkout is bad")
+            import shutil
+            shutil.rmtree(repo_dir)
+            git_repo = None
+        except Exception as e:
+            error_list.append({"error": e, "repo": repo["repo"]})
+
+    if not repo_dir.exists() or git_repo is None:
+        if repo["type"] == "gitea":
+            repo_url = (
+                f"ssh://git@gitea.eco.tsi-dev.otc-service.com:2222/"
+                f"{repo['repo']}"
+            )
+        elif repo["type"] == "github":
+            repo_url = f"git@github.com:{repo['repo']}"
+        else:
+            logging.error(f"Repository type {repo['type']} is not supported")
+            return
+
+        try:
+            logging.debug(f"Cloning repository {repo_url}")
+            git_repo = Repo.clone_from(repo_url, repo_dir, branch="main")
+        except Exception as e:
+            logging.error(f"Error cloning repository {repo_url}: {e}")
+            return
+
+    branch_name = f"add-meta-{args.branch_name}"
+
+    try:
+        new_branch = git_repo.create_head(branch_name, "main")
+    except Exception as e:
+        logging.warning(f"Skipping service {service} due to {e}")
+        return
+
+    new_branch.checkout()
+
+    # Find all RST files in the documentation (doc/, umn/, api-ref/).
+    # The set drops files matched by more than one pattern, e.g. doc/umn/x.rst.
+    patterns = ("doc/**/*.rst", "umn/**/*.rst", "api-ref/**/*.rst")
+    rst_files = sorted({path for pattern in patterns for path in repo_dir.rglob(pattern)})
+
+    processed_count = 0
+    updated_count = 0
+
+    for rst_file in rst_files:
+        logging.debug(f"Analyzing document {rst_file}")
+        processed_count += 1
+
+        try:
+            content = read_rst_content(rst_file)
+            if ".. meta::" in content:
+                # Skip before asking the LLM: the generated text would be discarded anyway.
+                logging.debug(f"Meta block already exists in {rst_file}. Skipping.")
+                continue
+
+            description = generate_description_with_llm(
+                content,
+                service["service_title"],
+                args.llm_api_url,
+                args.llm_model,
+                args.llm_username,
+                args.llm_password,
+            )
+
+            if add_sphinx_metadata(rst_file, description):
+                updated_count += 1
+                git_repo.index.add([str(rst_file)])
+                logging.info(f"Added meta description to {rst_file}")
+        except Exception as e:
+            logging.error(f"Error processing {rst_file}: {e}")
+            error_list.append({"error": e, "repo": str(rst_file)})
+
+    if len(git_repo.index.diff("HEAD")) == 0:
+        logging.debug("No changes required for service %s", service["service_type"])
+    else:
+        git_repo.index.commit(args.commit_description)
+        try:
+            git_repo.git.push("--set-upstream", "origin", branch_name)
+            logging.info(f"Pushed changes for service {service['service_title']}")
+        except Exception as e:
+            error_list.append({"error": e, "repo": repo["repo"]})
+        else:
+            # Decide by repository type: the clone URL is unknown for a reused checkout.
+            if repo["type"] == "github":
+                subprocess.run(args=["gh", "pr", "create", "-f"], cwd=repo_dir, check=False)
+            # gitea: pull request creation is not implemented yet (args.token is unused)
+
+    if error_list:
+        logging.error("The following errors have happened:")
+        logging.error(error_list)
+
+    logging.info(f"Processed {processed_count} files, updated {updated_count} files")
+
+
+def main():
+    """Parse the command-line arguments and run process_service() for each selected service."""
+    parser = argparse.ArgumentParser(
+        description="Add Sphinx meta blocks to RST files using LLM-generated descriptions."
+    )
+    parser.add_argument(
+        "--target-environment",
+        required=True,
+        choices=["internal", "public"],
+        help="Environment to be used as a source",
+    )
+    parser.add_argument("--service-type", help="Service to update")
+    parser.add_argument(
+        "--work-dir",
+        required=True,
+        help="Working directory to use for repository checkout.",
+    )
+    parser.add_argument(
+        "--branch-name",
+        default="meta-generation",
+        help="Branch name to be used for changes.",
+    )
+    parser.add_argument("--token", metavar="token", help="API token")
+    parser.add_argument(
+        "--llm-api-url",
+        default="http://localhost:8080/v1/completions",
+        help="URL of the LLM API server. Default: http://localhost:8080/v1/completions",
+    )
+    parser.add_argument(
+        "--llm-model",
+        default="llama2",
+        help="LLM model name to use. Default: llama2",
+    )
+    parser.add_argument(
+        "--llm-username",
+        help="Username for Basic Authentication with LLM server",
+    )
+    parser.add_argument(
+        "--llm-password",
+        help="Password for Basic Authentication with LLM server",
+    )
+    parser.add_argument(
+        "--commit-description",
+        default=(
+            "Add Sphinx meta blocks to RST files\n\n"
+            "Generated by otc-metadata-rework/tools/generate_meta.py"
+        ),
+        help="Commit description for the commit",
+    )
+    parser.add_argument(
+        "--cloud-environment",
+        # optional on purpose: a required option could never use the documented default
+        default="eu_de",
+        help="Cloud Environment. Default: eu_de",
+    )
+
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.DEBUG)
+
+    if args.service_type:
+        services = [data.get_service_with_repo_by_service_type(service_type=args.service_type)]
+    else:
+        services = data.services_with_repos()
+
+    for service in services:
+        process_service(args, service)
+
+
+if __name__ == "__main__":
+    main()  # run the CLI only when executed as a script, not when imported
\ No newline at end of file