Files
otc-metadata-rework/tools/generate_meta.py
Sebastian Gode d1ab43d02e
Some checks failed
Run Tox Check / tox-py312 (pull_request) Successful in 16s
Run Tox Check / tox-pep8 (pull_request) Failing after 16s
prompt fix
2026-03-04 13:43:01 +00:00

484 lines
17 KiB
Python
Executable File

#!/usr/bin/python
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import base64
import logging
import pathlib
import requests
import subprocess
from git import exc
from git import Repo
import otc_metadata.services
# Shared service catalogue from otc_metadata, queried in main().
data = otc_metadata.services.Services()
# NOTE(review): this session is never used anywhere in this file —
# requests.post() is called directly in the generate_* functions; either
# route requests through this session (connection reuse) or remove it.
api_session = requests.Session()
def extract_description(result):
    """Extract a one-sentence meta description from an LLM API response.

    Supports three response shapes: OpenAI-style chat completions
    (``choices[0].message.content``), llama.cpp ``/completion``
    (``response``) and a plain ``text`` field.

    :param result: Parsed JSON body returned by the LLM API.
    :returns: First sentence, capped at 160 characters, or ``None`` when
        no usable text could be extracted.
    """
    # Guard against non-dict payloads: `"choices" in result` on a string
    # would do a substring search and mask malformed responses.
    if not isinstance(result, dict):
        return None
    if result.get("choices"):
        message = result["choices"][0].get("message", {})
        description = message.get("content", "")
    elif "response" in result:
        description = result["response"]
    elif "text" in result:
        description = result["text"]
    else:
        return None
    description = description.strip()
    if not description:
        return None
    # Keep only the first sentence.
    first_sentence = description.split(".")[0].strip() + "."
    if len(first_sentence) <= 1:
        # The text started with "." — fall back to a truncated preview.
        first_sentence = description[:160].strip() + "."
    if len(first_sentence) > 160:
        first_sentence = first_sentence[:157] + "..."
    return first_sentence
def extract_keywords(result):
    """Extract up to five comma-separated keywords from an LLM API response.

    Accepts the same response shapes as ``extract_description``:
    OpenAI-style chat completions, llama.cpp ``response`` and plain
    ``text`` fields.

    :param result: Parsed JSON body returned by the LLM API.
    :returns: A ``", "``-joined string of at most five keywords, or
        ``None`` when no usable text could be extracted.
    """
    # Reject non-dict payloads up front (a string would pass the
    # original `in` checks via substring search).
    if not isinstance(result, dict):
        return None
    if result.get("choices"):
        keywords_text = result["choices"][0].get("message", {}).get("content", "")
    elif "response" in result:
        keywords_text = result["response"]
    elif "text" in result:
        keywords_text = result["text"]
    else:
        return None
    keywords_text = keywords_text.strip()
    if not keywords_text:
        return None
    # Normalise: split on commas, drop empty entries, cap at five.
    keywords = [kw.strip() for kw in keywords_text.split(",") if kw.strip()]
    return ", ".join(keywords[:5])
def _fallback_description(text, service_title):
    """Return the first RST headline of *text* as a fallback description.

    A headline is a non-empty line (not starting with ``-`` or ``#``)
    directly above an underline made only of ``=``/``-`` characters.
    Truncated to 160 characters; falls back to
    ``"<service_title> documentation"`` when no headline is found.
    """
    lines = text.split("\n")
    for i, line in enumerate(lines):
        candidate = line.strip()
        if not candidate or candidate.startswith(("-", "#")):
            continue
        if i + 1 < len(lines):
            underline = lines[i + 1].strip()
            if underline and all(c in "=-" for c in underline):
                if len(candidate) > 160:
                    candidate = candidate[:157] + "..."
                return candidate
    return f"{service_title} documentation"


def generate_description_with_llm(text, service_title, llm_api_url, model_name, api_username, api_password):
    """Generate a meta description via an OpenAI-compatible chat endpoint.

    Makes up to three attempts against *llm_api_url* (a chat-completions
    URL — not the raw llama.cpp ``/completion`` endpoint); on total
    failure the first RST headline of *text* is used instead.

    :param text: RST document content to summarise.
    :param service_title: Human-readable service name used in the prompt.
    :param llm_api_url: Chat-completions endpoint URL.
    :param model_name: Model identifier sent in the request payload.
    :param api_username: Optional HTTP Basic-Auth user name.
    :param api_password: Optional HTTP Basic-Auth password.
    :returns: Description string of at most 160 characters (never None).
    """
    content_preview = text[:2000].replace("\n", " ")
    # Separator spaces between the sentences were missing (they ran
    # together as "...them.NEVER..."), and "similiar" was misspelled.
    prompt = (
        f"Generate a meta description (40-160 chars) for the T Cloud Public service: {service_title}. "
        f"This is a service from the cloud provider called 'T Cloud Public', do NOT mention other Cloud Providers or services from them. "
        f"NEVER mention AWS, Azure, Huawei, Alibaba, GCP and similar cloud providers. "
        f"Content preview: {content_preview}. "
        f"Output ONLY the description text, nothing else."
    )
    headers = {"Content-Type": "application/json"}
    if api_username and api_password:
        # llama.cpp servers behind a reverse proxy typically use Basic auth.
        credentials = f"{api_username}:{api_password}"
        headers["Authorization"] = "Basic " + base64.b64encode(credentials.encode()).decode()
    for attempt in range(3):
        try:
            response = requests.post(
                llm_api_url,
                json={
                    "messages": [
                        {"role": "user", "content": prompt},
                    ],
                    "model": model_name,
                    "temperature": 0.5,
                    "top_k": 40,
                    "top_p": 0.9,
                    "min_p": 0.05,
                    "repeat_last_n": 256,
                    "repeat_penalty": 1.18,
                    "presence_penalty": 0.2,
                    "frequency_penalty": 0.2,
                    "dry_multiplier": 0.8,
                    "dry_base": 1.75,
                    "dry_allowed_length": 2,
                    "dry_penalty_last_n": -1,
                    # Suppress chain-of-thought output on reasoning models.
                    "chat_template_kwargs": {"enable_thinking": False},
                },
                headers=headers,
                timeout=15,
            )
            response.raise_for_status()
            description = extract_description(response.json())
            if description:
                return description
            logging.warning(f"Attempt {attempt + 1}: Empty or invalid response from LLM API.")
        except requests.exceptions.RequestException as e:
            logging.warning(f"Attempt {attempt + 1}: LLM API request failed: {e}. Retrying...")
        except (KeyError, ValueError, IndexError) as e:
            logging.warning(f"Attempt {attempt + 1}: LLM API response parsing failed: {e}. Retrying...")
    # After all retries failed, use fallback - extract first headline.
    logging.warning("All LLM API retries failed. Using fallback description from first headline.")
    return _fallback_description(text, service_title)
def generate_keywords_with_llm(text, service_title, llm_api_url, model_name, api_username, api_password):
    """Generate SEO keywords via an OpenAI-compatible chat endpoint.

    Makes up to three attempts against *llm_api_url* (a chat-completions
    URL — not the raw llama.cpp ``/completion`` endpoint); on total
    failure a title-cased variant of *service_title* is returned.

    :param text: RST document content the keywords should describe.
    :param service_title: Human-readable service name used in the prompt.
    :param llm_api_url: Chat-completions endpoint URL.
    :param model_name: Model identifier sent in the request payload.
    :param api_username: Optional HTTP Basic-Auth user name.
    :param api_password: Optional HTTP Basic-Auth password.
    :returns: Comma-separated keyword string (never None).
    """
    content_preview = text[:2000].replace("\n", " ")
    # Separator spaces between the sentences were missing, and
    # "similiar" was misspelled.
    prompt = (
        f"Generate up to 5 keywords (comma-separated) for the T Cloud Public service: {service_title}. "
        f"This is a service from the cloud provider called 'T Cloud Public', do NOT mention other Cloud Providers or services from them. "
        f"NEVER mention AWS, Azure, Huawei, Alibaba, GCP and similar cloud providers. "
        f"Content preview: {content_preview}. "
        f"Output ONLY comma-separated keywords, nothing else."
    )
    headers = {"Content-Type": "application/json"}
    if api_username and api_password:
        # Same Basic-auth scheme as generate_description_with_llm.
        credentials = f"{api_username}:{api_password}"
        headers["Authorization"] = "Basic " + base64.b64encode(credentials.encode()).decode()
    for attempt in range(3):
        try:
            response = requests.post(
                llm_api_url,
                json={
                    "messages": [
                        {"role": "user", "content": prompt},
                    ],
                    "model": model_name,
                    "temperature": 0.7,
                    "top_k": 40,
                    "top_p": 0.9,
                    "min_p": 0.05,
                    "repeat_last_n": 256,
                    "repeat_penalty": 1.18,
                    "presence_penalty": 0.2,
                    "frequency_penalty": 0.2,
                    "dry_multiplier": 0.8,
                    "dry_base": 1.75,
                    "dry_allowed_length": 2,
                    "dry_penalty_last_n": -1,
                    # Suppress chain-of-thought output on reasoning models.
                    "chat_template_kwargs": {"enable_thinking": False},
                },
                headers=headers,
                timeout=15,
            )
            response.raise_for_status()
            keywords = extract_keywords(response.json())
            if keywords:
                return keywords
            logging.warning(f"Attempt {attempt + 1}: Empty or invalid response from LLM API for keywords.")
        except requests.exceptions.RequestException as e:
            logging.warning(f"Attempt {attempt + 1}: LLM API request failed: {e}. Retrying...")
        except (KeyError, ValueError, IndexError) as e:
            logging.warning(f"Attempt {attempt + 1}: LLM API response parsing failed: {e}. Retrying...")
    logging.warning("All LLM API retries failed for keywords. Using fallback.")
    # Fallback: derive a keyword from the service title itself (the
    # original wrapped this expression in a pointless f-string).
    return service_title.replace("-", " ").title()
def read_rst_content(file_path):
    """Return the full text of the RST file at *file_path*, decoded as UTF-8."""
    return pathlib.Path(file_path).read_text(encoding="utf-8")
def add_sphinx_metadata(file_path, meta_description, meta_keywords=None):
    """Append a Sphinx ``.. meta::`` block to the end of an RST file.

    :param file_path: Path of the RST file to update.
    :param meta_description: Text for the ``:description:`` field.
    :param meta_keywords: Optional text for the ``:keywords:`` field.
    :returns: ``True`` when the file was modified; ``False`` when no
        metadata was supplied or the file already has a meta block.
    """
    # Fix: previously an empty ".. meta::" block was written even when
    # neither field was provided.
    if not meta_description and not meta_keywords:
        return False
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    # Never duplicate an existing meta block.
    if ".. meta::" in content:
        logging.debug(f"Meta block already exists in {file_path}. Skipping.")
        return False
    meta_block = "\n\n.. meta::\n"
    if meta_description:
        meta_block += " :description: {}\n".format(meta_description)
    if meta_keywords:
        meta_block += " :keywords: {}\n".format(meta_keywords)
    # Append meta block at the end of the file; newline="" keeps the
    # "\n" line endings untranslated on every platform.
    new_content = content.rstrip() + meta_block
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        f.write(new_content)
    return True
def process_service(args, service):
    """Check out a service's docs repository and add meta blocks to its RST files.

    Selects the repository matching ``args.cloud_environment`` and
    ``args.target_environment``, clones or refreshes it, generates an LLM
    description and keywords for every RST file under doc/, umn/ and
    api-ref/, commits the changes on a new branch, pushes, and opens a PR
    for GitHub repositories.

    :param args: Parsed CLI namespace (work_dir, branch_name, LLM settings, ...).
    :param service: Service dict from otc_metadata containing ``repositories``.
    :returns: Number of files updated, or ``None`` on any early exit
        (no matching repo, clone/branch failure, or no changes) —
        callers must treat ``None`` as zero.
    """
    logging.debug(f"Processing service {service['service_title']}")
    workdir = pathlib.Path(args.work_dir)
    workdir.mkdir(exist_ok=True)
    repo_url = None
    repo_dir = None
    git_repo = None
    error_list = []
    files_updated = 0
    repo = None
    # Pick the repository matching the requested cloud/target environment.
    # NOTE(review): only cloud_environments[0] is compared — repositories
    # listing multiple cloud environments may be skipped; confirm intended.
    for r in service["repositories"]:
        if r["cloud_environments"][0] == args.cloud_environment:
            repo_dir = workdir / r["type"] / r["repo"]
            if r["environment"] == args.target_environment:
                repo = r
                break
        else:
            logging.debug(f"Skipping repository {r}")
            continue
    if not repo_dir:
        logging.info(f"No repository found for service {service['service_title']}")
        return
    # NOTE(review): `repo` can still be None here when a repository matched
    # the cloud environment but not the target environment; the
    # repo["type"] / repo["repo"] accesses below would then raise
    # TypeError — verify against the metadata contract.
    if repo_dir.exists():
        logging.debug(f"Repository {repo_dir} already exists")
        try:
            # Refresh an existing checkout: fetch, switch to main, pull.
            git_repo = Repo(repo_dir)
            git_repo.remotes.origin.fetch()
            git_repo.heads.main.checkout()
            git_repo.remotes.origin.pull()
        except exc.InvalidGitRepositoryError:
            # Corrupt checkout: wipe it so it gets re-cloned below.
            logging.error("Existing repository checkout is bad")
            import shutil
            shutil.rmtree(repo_dir)
            git_repo = None
        except Exception as e:
            error_list.append({"error": e, "repo": repo["repo"]})
    if not repo_dir.exists() or git_repo is None:
        # Build the clone URL for the supported forge types.
        if repo["type"] == "gitea":
            repo_url = (
                f"ssh://git@gitea.eco.tsi-dev.otc-service.com:2222/"
                f"{repo['repo']}"
            )
        elif repo["type"] == "github":
            repo_url = f"git@github.com:{repo['repo']}"
        else:
            logging.error(f"Repository type {repo['type']} is not supported")
            error_list.append({"error": f"Repository type {repo['type']} is not supported", "repo": repo["repo"]})
            return
        try:
            logging.debug(f"Cloning repository {repo_url}")
            git_repo = Repo.clone_from(repo_url, repo_dir, branch="main")
        except Exception as e:
            logging.error(f"Error cloning repository {repo_url}: {e}")
            error_list.append({"error": f"Error cloning repository {repo_url}", "repo": repo["repo"]})
            return
    # All changes go onto a dedicated branch created from main.
    branch_name = f"add-meta-{args.branch_name}"
    try:
        new_branch = git_repo.create_head(branch_name, "main")
    except Exception as e:
        # Typically means the branch already exists from a previous run.
        logging.warning(f"Skipping service {service} due to {e}")
        error_list.append({"error": e, "repo": repo["repo"]})
        return
    new_branch.checkout()
    # Only the documentation trees are processed.
    rst_files = (list(repo_dir.rglob("doc/**/*.rst"))
                 + list(repo_dir.rglob("umn/**/*.rst"))
                 + list(repo_dir.rglob("api-ref/**/*.rst")))
    processed_count = 0
    updated_count = 0
    for rst_file in rst_files:
        logging.debug(f"Analyzing document {rst_file}")
        try:
            content = read_rst_content(rst_file)
            description = generate_description_with_llm(
                content,
                service["service_title"],
                args.llm_api_url,
                args.llm_model,
                args.llm_username,
                args.llm_password
            )
            keywords = generate_keywords_with_llm(
                content,
                service["service_title"],
                args.llm_api_url,
                args.llm_model,
                args.llm_username,
                args.llm_password
            )
            if add_sphinx_metadata(rst_file, description, keywords):
                updated_count += 1
                files_updated += 1
                logging.info(f"Added meta description and keywords to {rst_file}")
            else:
                # File already carried a meta block — counted but unchanged.
                processed_count += 1
            git_repo.index.add([str(rst_file)])
        except Exception as e:
            logging.error(f"Error processing {rst_file}: {e}")
            error_list.append({"error": e, "repo": str(rst_file)})
    # Nothing staged differs from HEAD -> skip commit/push entirely.
    if len(git_repo.index.diff("HEAD")) == 0:
        logging.debug("No changes required for service %s", service["service_type"])
        return
    git_repo.index.commit(args.commit_description)
    try:
        git_repo.git.push("--set-upstream", "origin", branch_name)
        logging.info(f"Pushed changes for service {service['service_title']}")
    except Exception as e:
        error_list.append({"error": e, "repo": repo["repo"]})
    if repo_url and "github" in repo_url:
        # Open a pull request via the GitHub CLI (best effort, no check).
        subprocess.run(
            args=["gh", "pr", "create", "-f"], cwd=repo_dir, check=False
        )
    elif repo_url and "gitea" in repo_url and args.token:
        # NOTE(review): Gitea PR creation is not implemented yet.
        pass
    if len(error_list) != 0:
        logging.error("The following errors have happened:")
        logging.error(error_list)
    logging.info(f"Processed {processed_count} files, updated {updated_count} files")
    return files_updated
def main():
    """CLI entry point: parse arguments and process all matching services."""
    parser = argparse.ArgumentParser(
        description="Add Sphinx meta blocks to RST files using LLM-generated descriptions."
    )
    parser.add_argument(
        "--target-environment",
        required=True,
        choices=["internal", "public"],
        help="Environment to be used as a source",
    )
    parser.add_argument("--service-type", help="Service to update")
    parser.add_argument(
        "--work-dir",
        required=True,
        help="Working directory to use for repository checkout.",
    )
    parser.add_argument(
        "--branch-name",
        default="meta-generation",
        help="Branch name to be used for changes.",
    )
    parser.add_argument("--token", metavar="token", help="API token")
    parser.add_argument(
        "--llm-api-url",
        default="http://localhost:8080/v1/chat/completions",
        help="URL of the LLM API server. Default: http://localhost:8080/v1/chat/completions",
    )
    parser.add_argument(
        "--llm-model",
        default="llama2",
        help="LLM model name to use. Default: llama2",
    )
    parser.add_argument(
        "--llm-username",
        help="Username for Basic Authentication with LLM server",
    )
    parser.add_argument(
        "--llm-password",
        help="Password for Basic Authentication with LLM server",
    )
    parser.add_argument(
        "--commit-description",
        default=(
            "Add Sphinx meta blocks to RST files\n\n"
            "Generated by otc-metadata-rework/tools/generate_meta.py"
        ),
        help="Commit description for the commit",
    )
    # Fix: this option previously combined required=True with a default,
    # which made the default dead code. Keeping only the default is
    # backward compatible (explicit --cloud-environment still works).
    parser.add_argument(
        "--cloud-environment",
        default="eu_de",
        help="Cloud Environment. Default: eu_de",
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG)
    if args.service_type:
        services = [data.get_service_with_repo_by_service_type(service_type=args.service_type)]
    else:
        services = data.services_with_repos()
    total_files_updated = 0
    services_with_updates = []
    for service in services:
        files_updated = process_service(args, service)
        # Fix: process_service() returns None on early exits (no repo,
        # clone failure, no changes), so `files_updated > 0` raised
        # TypeError; truthiness handles both None and 0.
        if files_updated:
            total_files_updated += files_updated
            services_with_updates.append((service['service_title'], files_updated))
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files with metadata added: {total_files_updated}")
    print(f"Services with updates: {len(services_with_updates)}")
    print("\nServices with metadata added:")
    for service_title, count in services_with_updates:
        print(f" - {service_title}: {count} file(s)")
    print("=" * 60)
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()