Files
otc-metadata-rework/tools/generate_meta.py
Sebastian Gode d1ab43d02e
Some checks failed
Run Tox Check / tox-py312 (pull_request) Successful in 16s
Run Tox Check / tox-pep8 (pull_request) Failing after 16s
prompt fix
2026-03-04 13:43:01 +00:00

484 lines
17 KiB
Python
Executable File

#!/usr/bin/python
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import base64
import logging
import pathlib
import requests
import subprocess
from git import exc
from git import Repo
import otc_metadata.services
# Shared service catalogue from otc_metadata, queried in main().
data = otc_metadata.services.Services()
# NOTE(review): this session is never used anywhere in this file —
# requests.post() is called directly in the generate_* functions; either
# route requests through this session (connection reuse) or remove it.
api_session = requests.Session()
def extract_description(result):
    """Extract a one-sentence meta description from an LLM API response.

    Supports three response shapes: OpenAI-style chat completions
    (``choices[0].message.content``), llama.cpp ``/completion``
    (``response``) and a plain ``text`` field.

    :param result: Parsed JSON body returned by the LLM API.
    :returns: First sentence, capped at 160 characters, or ``None`` when
        no usable text could be extracted.
    """
    # Guard against non-dict payloads: `"choices" in result` on a string
    # would do a substring search and mask malformed responses.
    if not isinstance(result, dict):
        return None
    if result.get("choices"):
        message = result["choices"][0].get("message", {})
        description = message.get("content", "")
    elif "response" in result:
        description = result["response"]
    elif "text" in result:
        description = result["text"]
    else:
        return None
    description = description.strip()
    if not description:
        return None
    # Keep only the first sentence.
    first_sentence = description.split(".")[0].strip() + "."
    if len(first_sentence) <= 1:
        # The text started with "." — fall back to a truncated preview.
        first_sentence = description[:160].strip() + "."
    if len(first_sentence) > 160:
        first_sentence = first_sentence[:157] + "..."
    return first_sentence
def extract_keywords(result):
    """Extract up to five comma-separated keywords from an LLM API response.

    Accepts the same response shapes as ``extract_description``:
    OpenAI-style chat completions, llama.cpp ``response`` and plain
    ``text`` fields.

    :param result: Parsed JSON body returned by the LLM API.
    :returns: A ``", "``-joined string of at most five keywords, or
        ``None`` when no usable text could be extracted.
    """
    # Reject non-dict payloads up front (a string would pass the
    # original `in` checks via substring search).
    if not isinstance(result, dict):
        return None
    if result.get("choices"):
        keywords_text = result["choices"][0].get("message", {}).get("content", "")
    elif "response" in result:
        keywords_text = result["response"]
    elif "text" in result:
        keywords_text = result["text"]
    else:
        return None
    keywords_text = keywords_text.strip()
    if not keywords_text:
        return None
    # Normalise: split on commas, drop empty entries, cap at five.
    keywords = [kw.strip() for kw in keywords_text.split(",") if kw.strip()]
    return ", ".join(keywords[:5])
def _fallback_description(text, service_title):
    """Return the first RST headline of *text* as a fallback description.

    A headline is a non-empty line (not starting with ``-`` or ``#``)
    directly above an underline made only of ``=``/``-`` characters.
    Truncated to 160 characters; falls back to
    ``"<service_title> documentation"`` when no headline is found.
    """
    lines = text.split("\n")
    for i, line in enumerate(lines):
        candidate = line.strip()
        if not candidate or candidate.startswith(("-", "#")):
            continue
        if i + 1 < len(lines):
            underline = lines[i + 1].strip()
            if underline and all(c in "=-" for c in underline):
                if len(candidate) > 160:
                    candidate = candidate[:157] + "..."
                return candidate
    return f"{service_title} documentation"


def generate_description_with_llm(text, service_title, llm_api_url, model_name, api_username, api_password):
    """Generate a meta description via an OpenAI-compatible chat endpoint.

    Makes up to three attempts against *llm_api_url* (a chat-completions
    URL — not the raw llama.cpp ``/completion`` endpoint); on total
    failure the first RST headline of *text* is used instead.

    :param text: RST document content to summarise.
    :param service_title: Human-readable service name used in the prompt.
    :param llm_api_url: Chat-completions endpoint URL.
    :param model_name: Model identifier sent in the request payload.
    :param api_username: Optional HTTP Basic-Auth user name.
    :param api_password: Optional HTTP Basic-Auth password.
    :returns: Description string of at most 160 characters (never None).
    """
    content_preview = text[:2000].replace("\n", " ")
    # Separator spaces between the sentences were missing (they ran
    # together as "...them.NEVER..."), and "similiar" was misspelled.
    prompt = (
        f"Generate a meta description (40-160 chars) for the T Cloud Public service: {service_title}. "
        f"This is a service from the cloud provider called 'T Cloud Public', do NOT mention other Cloud Providers or services from them. "
        f"NEVER mention AWS, Azure, Huawei, Alibaba, GCP and similar cloud providers. "
        f"Content preview: {content_preview}. "
        f"Output ONLY the description text, nothing else."
    )
    headers = {"Content-Type": "application/json"}
    if api_username and api_password:
        # llama.cpp servers behind a reverse proxy typically use Basic auth.
        credentials = f"{api_username}:{api_password}"
        headers["Authorization"] = "Basic " + base64.b64encode(credentials.encode()).decode()
    for attempt in range(3):
        try:
            response = requests.post(
                llm_api_url,
                json={
                    "messages": [
                        {"role": "user", "content": prompt},
                    ],
                    "model": model_name,
                    "temperature": 0.5,
                    "top_k": 40,
                    "top_p": 0.9,
                    "min_p": 0.05,
                    "repeat_last_n": 256,
                    "repeat_penalty": 1.18,
                    "presence_penalty": 0.2,
                    "frequency_penalty": 0.2,
                    "dry_multiplier": 0.8,
                    "dry_base": 1.75,
                    "dry_allowed_length": 2,
                    "dry_penalty_last_n": -1,
                    # Suppress chain-of-thought output on reasoning models.
                    "chat_template_kwargs": {"enable_thinking": False},
                },
                headers=headers,
                timeout=15,
            )
            response.raise_for_status()
            description = extract_description(response.json())
            if description:
                return description
            logging.warning(f"Attempt {attempt + 1}: Empty or invalid response from LLM API.")
        except requests.exceptions.RequestException as e:
            logging.warning(f"Attempt {attempt + 1}: LLM API request failed: {e}. Retrying...")
        except (KeyError, ValueError, IndexError) as e:
            logging.warning(f"Attempt {attempt + 1}: LLM API response parsing failed: {e}. Retrying...")
    # After all retries failed, use fallback - extract first headline.
    logging.warning("All LLM API retries failed. Using fallback description from first headline.")
    return _fallback_description(text, service_title)
def generate_keywords_with_llm(text, service_title, llm_api_url, model_name, api_username, api_password):
    """Generate SEO keywords via an OpenAI-compatible chat endpoint.

    Makes up to three attempts against *llm_api_url* (a chat-completions
    URL — not the raw llama.cpp ``/completion`` endpoint); on total
    failure a title-cased variant of *service_title* is returned.

    :param text: RST document content the keywords should describe.
    :param service_title: Human-readable service name used in the prompt.
    :param llm_api_url: Chat-completions endpoint URL.
    :param model_name: Model identifier sent in the request payload.
    :param api_username: Optional HTTP Basic-Auth user name.
    :param api_password: Optional HTTP Basic-Auth password.
    :returns: Comma-separated keyword string (never None).
    """
    content_preview = text[:2000].replace("\n", " ")
    # Separator spaces between the sentences were missing, and
    # "similiar" was misspelled.
    prompt = (
        f"Generate up to 5 keywords (comma-separated) for the T Cloud Public service: {service_title}. "
        f"This is a service from the cloud provider called 'T Cloud Public', do NOT mention other Cloud Providers or services from them. "
        f"NEVER mention AWS, Azure, Huawei, Alibaba, GCP and similar cloud providers. "
        f"Content preview: {content_preview}. "
        f"Output ONLY comma-separated keywords, nothing else."
    )
    headers = {"Content-Type": "application/json"}
    if api_username and api_password:
        # Same Basic-auth scheme as generate_description_with_llm.
        credentials = f"{api_username}:{api_password}"
        headers["Authorization"] = "Basic " + base64.b64encode(credentials.encode()).decode()
    for attempt in range(3):
        try:
            response = requests.post(
                llm_api_url,
                json={
                    "messages": [
                        {"role": "user", "content": prompt},
                    ],
                    "model": model_name,
                    "temperature": 0.7,
                    "top_k": 40,
                    "top_p": 0.9,
                    "min_p": 0.05,
                    "repeat_last_n": 256,
                    "repeat_penalty": 1.18,
                    "presence_penalty": 0.2,
                    "frequency_penalty": 0.2,
                    "dry_multiplier": 0.8,
                    "dry_base": 1.75,
                    "dry_allowed_length": 2,
                    "dry_penalty_last_n": -1,
                    # Suppress chain-of-thought output on reasoning models.
                    "chat_template_kwargs": {"enable_thinking": False},
                },
                headers=headers,
                timeout=15,
            )
            response.raise_for_status()
            keywords = extract_keywords(response.json())
            if keywords:
                return keywords
            logging.warning(f"Attempt {attempt + 1}: Empty or invalid response from LLM API for keywords.")
        except requests.exceptions.RequestException as e:
            logging.warning(f"Attempt {attempt + 1}: LLM API request failed: {e}. Retrying...")
        except (KeyError, ValueError, IndexError) as e:
            logging.warning(f"Attempt {attempt + 1}: LLM API response parsing failed: {e}. Retrying...")
    logging.warning("All LLM API retries failed for keywords. Using fallback.")
    # Fallback: derive a keyword from the service title itself (the
    # original wrapped this expression in a pointless f-string).
    return service_title.replace("-", " ").title()
def read_rst_content(file_path):
    """Return the full text of the RST file at *file_path*, decoded as UTF-8."""
    return pathlib.Path(file_path).read_text(encoding="utf-8")
def add_sphinx_metadata(file_path, meta_description, meta_keywords=None):
    """Append a Sphinx ``.. meta::`` block to the end of an RST file.

    :param file_path: Path of the RST file to update.
    :param meta_description: Text for the ``:description:`` field.
    :param meta_keywords: Optional text for the ``:keywords:`` field.
    :returns: ``True`` when the file was modified; ``False`` when no
        metadata was supplied or the file already has a meta block.
    """
    # Fix: previously an empty ".. meta::" block was written even when
    # neither field was provided.
    if not meta_description and not meta_keywords:
        return False
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    # Never duplicate an existing meta block.
    if ".. meta::" in content:
        logging.debug(f"Meta block already exists in {file_path}. Skipping.")
        return False
    meta_block = "\n\n.. meta::\n"
    if meta_description:
        meta_block += " :description: {}\n".format(meta_description)
    if meta_keywords:
        meta_block += " :keywords: {}\n".format(meta_keywords)
    # Append meta block at the end of the file; newline="" keeps the
    # "\n" line endings untranslated on every platform.
    new_content = content.rstrip() + meta_block
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        f.write(new_content)
    return True
def process_service(args, service):
    """Check out a service's docs repository and add meta blocks to its RST files.

    Selects the repository matching ``args.cloud_environment`` and
    ``args.target_environment``, clones or refreshes it, generates an LLM
    description and keywords for every RST file under doc/, umn/ and
    api-ref/, commits the changes on a new branch, pushes, and opens a PR
    for GitHub repositories.

    :param args: Parsed CLI namespace (work_dir, branch_name, LLM settings, ...).
    :param service: Service dict from otc_metadata containing ``repositories``.
    :returns: Number of files updated, or ``None`` on any early exit
        (no matching repo, clone/branch failure, or no changes) —
        callers must treat ``None`` as zero.
    """
    logging.debug(f"Processing service {service['service_title']}")
    workdir = pathlib.Path(args.work_dir)
    workdir.mkdir(exist_ok=True)
    repo_url = None
    repo_dir = None
    git_repo = None
    error_list = []
    files_updated = 0
    repo = None
    # Pick the repository matching the requested cloud/target environment.
    # NOTE(review): only cloud_environments[0] is compared — repositories
    # listing multiple cloud environments may be skipped; confirm intended.
    for r in service["repositories"]:
        if r["cloud_environments"][0] == args.cloud_environment:
            repo_dir = workdir / r["type"] / r["repo"]
            if r["environment"] == args.target_environment:
                repo = r
                break
        else:
            logging.debug(f"Skipping repository {r}")
            continue
    if not repo_dir:
        logging.info(f"No repository found for service {service['service_title']}")
        return
    # NOTE(review): `repo` can still be None here when a repository matched
    # the cloud environment but not the target environment; the
    # repo["type"] / repo["repo"] accesses below would then raise
    # TypeError — verify against the metadata contract.
    if repo_dir.exists():
        logging.debug(f"Repository {repo_dir} already exists")
        try:
            # Refresh an existing checkout: fetch, switch to main, pull.
            git_repo = Repo(repo_dir)
            git_repo.remotes.origin.fetch()
            git_repo.heads.main.checkout()
            git_repo.remotes.origin.pull()
        except exc.InvalidGitRepositoryError:
            # Corrupt checkout: wipe it so it gets re-cloned below.
            logging.error("Existing repository checkout is bad")
            import shutil
            shutil.rmtree(repo_dir)
            git_repo = None
        except Exception as e:
            error_list.append({"error": e, "repo": repo["repo"]})
    if not repo_dir.exists() or git_repo is None:
        # Build the clone URL for the supported forge types.
        if repo["type"] == "gitea":
            repo_url = (
                f"ssh://git@gitea.eco.tsi-dev.otc-service.com:2222/"
                f"{repo['repo']}"
            )
        elif repo["type"] == "github":
            repo_url = f"git@github.com:{repo['repo']}"
        else:
            logging.error(f"Repository type {repo['type']} is not supported")
            error_list.append({"error": f"Repository type {repo['type']} is not supported", "repo": repo["repo"]})
            return
        try:
            logging.debug(f"Cloning repository {repo_url}")
            git_repo = Repo.clone_from(repo_url, repo_dir, branch="main")
        except Exception as e:
            logging.error(f"Error cloning repository {repo_url}: {e}")
            error_list.append({"error": f"Error cloning repository {repo_url}", "repo": repo["repo"]})
            return
    # All changes go onto a dedicated branch created from main.
    branch_name = f"add-meta-{args.branch_name}"
    try:
        new_branch = git_repo.create_head(branch_name, "main")
    except Exception as e:
        # Typically means the branch already exists from a previous run.
        logging.warning(f"Skipping service {service} due to {e}")
        error_list.append({"error": e, "repo": repo["repo"]})
        return
    new_branch.checkout()
    # Only the documentation trees are processed.
    rst_files = (list(repo_dir.rglob("doc/**/*.rst"))
                 + list(repo_dir.rglob("umn/**/*.rst"))
                 + list(repo_dir.rglob("api-ref/**/*.rst")))
    processed_count = 0
    updated_count = 0
    for rst_file in rst_files:
        logging.debug(f"Analyzing document {rst_file}")
        try:
            content = read_rst_content(rst_file)
            description = generate_description_with_llm(
                content,
                service["service_title"],
                args.llm_api_url,
                args.llm_model,
                args.llm_username,
                args.llm_password
            )
            keywords = generate_keywords_with_llm(
                content,
                service["service_title"],
                args.llm_api_url,
                args.llm_model,
                args.llm_username,
                args.llm_password
            )
            if add_sphinx_metadata(rst_file, description, keywords):
                updated_count += 1
                files_updated += 1
                logging.info(f"Added meta description and keywords to {rst_file}")
            else:
                # File already carried a meta block — counted but unchanged.
                processed_count += 1
            git_repo.index.add([str(rst_file)])
        except Exception as e:
            logging.error(f"Error processing {rst_file}: {e}")
            error_list.append({"error": e, "repo": str(rst_file)})
    # Nothing staged differs from HEAD -> skip commit/push entirely.
    if len(git_repo.index.diff("HEAD")) == 0:
        logging.debug("No changes required for service %s", service["service_type"])
        return
    git_repo.index.commit(args.commit_description)
    try:
        git_repo.git.push("--set-upstream", "origin", branch_name)
        logging.info(f"Pushed changes for service {service['service_title']}")
    except Exception as e:
        error_list.append({"error": e, "repo": repo["repo"]})
    if repo_url and "github" in repo_url:
        # Open a pull request via the GitHub CLI (best effort, no check).
        subprocess.run(
            args=["gh", "pr", "create", "-f"], cwd=repo_dir, check=False
        )
    elif repo_url and "gitea" in repo_url and args.token:
        # NOTE(review): Gitea PR creation is not implemented yet.
        pass
    if len(error_list) != 0:
        logging.error("The following errors have happened:")
        logging.error(error_list)
    logging.info(f"Processed {processed_count} files, updated {updated_count} files")
    return files_updated
def main():
    """CLI entry point: parse arguments and process all matching services."""
    parser = argparse.ArgumentParser(
        description="Add Sphinx meta blocks to RST files using LLM-generated descriptions."
    )
    parser.add_argument(
        "--target-environment",
        required=True,
        choices=["internal", "public"],
        help="Environment to be used as a source",
    )
    parser.add_argument("--service-type", help="Service to update")
    parser.add_argument(
        "--work-dir",
        required=True,
        help="Working directory to use for repository checkout.",
    )
    parser.add_argument(
        "--branch-name",
        default="meta-generation",
        help="Branch name to be used for changes.",
    )
    parser.add_argument("--token", metavar="token", help="API token")
    parser.add_argument(
        "--llm-api-url",
        default="http://localhost:8080/v1/chat/completions",
        help="URL of the LLM API server. Default: http://localhost:8080/v1/chat/completions",
    )
    parser.add_argument(
        "--llm-model",
        default="llama2",
        help="LLM model name to use. Default: llama2",
    )
    parser.add_argument(
        "--llm-username",
        help="Username for Basic Authentication with LLM server",
    )
    parser.add_argument(
        "--llm-password",
        help="Password for Basic Authentication with LLM server",
    )
    parser.add_argument(
        "--commit-description",
        default=(
            "Add Sphinx meta blocks to RST files\n\n"
            "Generated by otc-metadata-rework/tools/generate_meta.py"
        ),
        help="Commit description for the commit",
    )
    # Fix: this option previously combined required=True with a default,
    # which made the default dead code. Keeping only the default is
    # backward compatible (explicit --cloud-environment still works).
    parser.add_argument(
        "--cloud-environment",
        default="eu_de",
        help="Cloud Environment. Default: eu_de",
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG)
    if args.service_type:
        services = [data.get_service_with_repo_by_service_type(service_type=args.service_type)]
    else:
        services = data.services_with_repos()
    total_files_updated = 0
    services_with_updates = []
    for service in services:
        files_updated = process_service(args, service)
        # Fix: process_service() returns None on early exits (no repo,
        # clone failure, no changes), so `files_updated > 0` raised
        # TypeError; truthiness handles both None and 0.
        if files_updated:
            total_files_updated += files_updated
            services_with_updates.append((service['service_title'], files_updated))
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files with metadata added: {total_files_updated}")
    print(f"Services with updates: {len(services_with_updates)}")
    print("\nServices with metadata added:")
    for service_title, count in services_with_updates:
        print(f" - {service_title}: {count} file(s)")
    print("=" * 60)
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()