Files
doc-exports/.gitea/workflows/helpers/metadata-check.py

150 lines
4.6 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Check that newly added HTML files are registered in both metadata files.
This script validates that every new HTML file added in a PR is properly
registered in both CLASS.TXT.json and ALL_META.TXT.json in the same directory.
Directory structure:
docs/vpc/api-ref/
├── CLASS.TXT.json # Contains uri references to HTML files
├── ALL_META.TXT.json # Contains uri references to HTML files
└── en-us_topic_XXXXX.html
Both metadata files use the "uri" field to reference HTML filenames. When a
new HTML file is added, it must be added to both metadata files.
Checks performed:
1. Does CLASS.TXT.json exist in the HTML file's directory?
2. Does ALL_META.TXT.json exist in the HTML file's directory?
3. Is the HTML filename listed in CLASS.TXT.json (in uri field)?
4. Is the HTML filename listed in ALL_META.TXT.json (in uri field)?
Usage:
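Set the ADDED_FILES environment variable to a whitespace-separated list of added
HTML file paths (paths that themselves contain whitespace are not supported).
Exits with code 1 if violations are found, 0 otherwise.
When violations are found, they are written to violations.json for comment generation.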
"""
import sys
import os
import json
def get_uris_from_metadata(json_path):
    """Collect every non-empty string "uri" value from a metadata JSON file.

    The file is expected to contain a JSON array of objects, each of which may
    carry a "uri" key. Entries that are not objects, or whose "uri" is missing,
    empty or not a string, are skipped one by one so that a single malformed
    entry cannot hide the valid entries that follow it.

    Args:
        json_path: Path to the metadata JSON file.

    Returns:
        A set of uri strings. The set is empty (and a warning is printed) when
        the file cannot be read or parsed, or when its top-level value is not
        a JSON array.
    """
    # Only the read/parse step is guarded; iterating the parsed data happens
    # outside the try so a bad entry cannot abort the whole scan.
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Warning: Error parsing {json_path}: {e}")
        return set()
    except Exception as e:
        # Best-effort by design: an unreadable file yields an empty set
        # rather than crashing the check.
        print(f"Warning: Error reading {json_path}: {e}")
        return set()

    if not isinstance(data, list):
        print(
            f"Warning: Error reading {json_path}: "
            f"expected a JSON array, got {type(data).__name__}"
        )
        return set()

    uris = set()
    for entry in data:
        if not isinstance(entry, dict):
            continue
        uri = entry.get("uri")
        # Only non-empty strings can ever match an HTML filename.
        if isinstance(uri, str) and uri:
            uris.add(uri)
    return uris
def check_html_file_registration(html_file, base_dir):
    """Check whether an HTML file is registered in both metadata files.

    The metadata files CLASS.TXT.json and ALL_META.TXT.json are looked up in
    the same directory as the HTML file. The lookup is anchored on base_dir,
    so the result does not depend on the current working directory.

    Args:
        html_file: Path to the HTML file (absolute, or relative to the
            current working directory).
        base_dir: Base directory of the repository.

    Returns:
        dict with:
        - 'file': HTML file path relative to base_dir
        - 'missing_from': list of metadata files it's missing from; a
          metadata file that does not exist is reported with the suffix
          " (file not found)"
        - 'directory': directory containing the HTML file, relative to
          base_dir
    """
    rel_path = os.path.relpath(html_file, base_dir)
    dir_path, filename = os.path.split(rel_path)
    result = {"file": rel_path, "missing_from": [], "directory": dir_path}

    # dir_path is relative to base_dir, so resolve the metadata files against
    # base_dir rather than against whatever the process cwd happens to be.
    metadata_dir = os.path.join(base_dir, dir_path)

    for metadata_name in ("CLASS.TXT.json", "ALL_META.TXT.json"):
        metadata_path = os.path.join(metadata_dir, metadata_name)
        if not os.path.exists(metadata_path):
            result["missing_from"].append(f"{metadata_name} (file not found)")
        elif filename not in get_uris_from_metadata(metadata_path):
            result["missing_from"].append(metadata_name)

    return result
def main():
    """Run the registration check for every file named in ADDED_FILES.

    Reads the whitespace-separated ADDED_FILES environment variable, checks
    each listed file that exists on disk against the metadata files in its
    directory, and reports the outcome on stdout.

    Exits with status 0 when nothing was listed or nothing is missing.
    Otherwise writes the collected violations to violations.json in the
    current directory, prints them, and exits with status 1.
    """
    # The repository root is taken to be the current working directory.
    repo_root = os.getcwd()

    # split() with no arguments already trims tokens and drops empty ones.
    candidates = os.environ.get("ADDED_FILES", "").split()
    if not candidates:
        print("No HTML files added in this PR")
        sys.exit(0)

    print(f"Checking {len(candidates)} added HTML file(s)")

    unregistered = []
    for path in candidates:
        if os.path.exists(path):
            report = check_html_file_registration(path, repo_root)
            if report["missing_from"]:
                unregistered.append(report)
        else:
            # A listed file that is absent is only warned about, not counted.
            print(f"Warning: File not found: {path}")

    if not unregistered:
        print("\nAll added HTML files are properly registered in metadata")
        sys.exit(0)

    # Persist the violations so a later step can build a comment from them.
    with open("violations.json", "w") as out:
        json.dump(unregistered, out, indent=2)

    print(f"\nFound {len(unregistered)} HTML file(s) not properly registered:")
    for report in unregistered:
        print(f" {report['file']}")
        for where in report["missing_from"]:
            print(f" ❌ Missing from: {where}")
    sys.exit(1)
# Run the check only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()