Files
doc-exports/.gitea/workflows/helpers/class-txt-check.py

162 lines
5.0 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Check CLASS.TXT.json files for duplicate titles under the same parent document.
This script validates that child documents under the same parent (p_code) have
unique titles. Comparison is case-insensitive, so "Creating an ECS" and
"creating an ecs" are considered duplicates.
JSON structure:
- Each entry has: code (document ID), p_code (parent document ID), title
- Documents with same p_code are siblings under the same parent
- Siblings must have unique titles (case-insensitive)
Example violation:
Parent (code="3", title="Virtual Private Cloud")
- Child (code="4", p_code="3", title="Creating a VPC")
- Child (code="5", p_code="3", title="creating a vpc") <- DUPLICATE!
Usage:
Set the CHANGED_FILES environment variable to a whitespace-separated list of
CLASS.TXT.json file paths.
Exits with code 1 if any violations are found, 0 otherwise.
When violations are found, they are written to violations.json for comment generation.
"""
import sys
import os
import json
def check_duplicate_titles(json_path):
    """
    Find sibling documents that share a title within one CLASS.TXT.json file.

    Entries are grouped by ``p_code`` (parent document ID). Within each group
    titles are compared after stripping surrounding whitespace and
    case-folding, so "Creating a VPC" and " creating a vpc " are duplicates.

    Entries are ignored when they are not JSON objects, when ``code`` or
    ``title`` is missing/empty, or when ``title`` is not a string. A missing,
    null or empty ``p_code`` is treated as the root level.

    Args:
        json_path: Path of the JSON file to check. The file is expected to
            hold a JSON array of entry objects.

    Returns:
        A list with one dict per duplicated title per parent, in order of
        first appearance in the file:
        [
            {
                'file': 'path/to/CLASS.TXT.json',
                'parent_code': '3',
                'parent_title': 'Virtual Private Cloud',
                'duplicate_title': 'Creating a VPC',
                'codes': ['4', '6']
            }
        ]
        An empty list is returned when there are no duplicates, and also when
        the file cannot be read, is not valid JSON, or does not contain an
        array (a message is printed in those cases).
    """
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON in {json_path}: {e}")
        return []
    except Exception as e:
        print(f"Error reading {json_path}: {e}")
        return []

    # Anything other than an array of objects cannot be checked; report it
    # instead of failing with an AttributeError further down.
    if not isinstance(data, list):
        print(f"Error: {json_path} does not contain a JSON array of entries")
        return []

    # code -> title, used to show a readable parent name in each violation.
    code_to_title = {}
    # p_code -> normalized title -> [(code, original title), ...]
    parent_groups = {}

    for entry in data:
        if not isinstance(entry, dict):
            continue

        code = entry.get("code", "")
        title = entry.get("title", "")
        if not code or not title:
            continue
        code_to_title[code] = title

        # Only string titles can be compared case-insensitively.
        if not isinstance(title, str):
            continue

        # Null/missing/empty parents all mean "root", so they share one group.
        p_code = entry.get("p_code") or ""
        # casefold() is the caseless-matching form of lower().
        title_key = title.strip().casefold()
        siblings = parent_groups.setdefault(p_code, {})
        siblings.setdefault(title_key, []).append((code, title))

    violations = []
    for p_code, title_groups in parent_groups.items():
        if p_code:
            parent_code = p_code
            parent_title = code_to_title.get(
                p_code, f"(unknown parent: {p_code})"
            )
        else:
            parent_code = "(root)"
            parent_title = "(root)"

        for docs in title_groups.values():
            if len(docs) < 2:
                continue
            violations.append(
                {
                    "file": json_path,
                    "parent_code": parent_code,
                    "parent_title": parent_title,
                    # Report the spelling used by the first occurrence.
                    "duplicate_title": docs[0][1],
                    "codes": [doc_code for doc_code, _ in docs],
                }
            )

    return violations
def main():
    """
    Check every file listed in CHANGED_FILES and report duplicate titles.

    Reads whitespace-separated paths from the CHANGED_FILES environment
    variable. Paths that do not exist are skipped with a warning. Each
    remaining file is passed to check_duplicate_titles().

    When violations are found they are written to ``violations.json`` in the
    current working directory, a summary is printed, and the process exits
    with status 1. Otherwise (including when CHANGED_FILES is empty) the
    process exits with status 0.
    """
    # str.split() with no arguments already drops empty tokens and
    # surrounding whitespace.
    changed_files = os.environ.get("CHANGED_FILES", "").split()

    if not changed_files:
        print("No CLASS.TXT.json files changed in this PR")
        sys.exit(0)

    print(f"Checking {len(changed_files)} CLASS.TXT.json file(s): {changed_files}")

    all_violations = []
    for filepath in changed_files:
        if not os.path.exists(filepath):
            print(f"Warning: File not found: {filepath}")
            continue
        all_violations.extend(check_duplicate_titles(filepath))

    if not all_violations:
        print("\nNo violations found")
        sys.exit(0)

    # Persist the violations so a later workflow step can build a comment.
    with open("violations.json", "w", encoding="utf-8") as f:
        json.dump(all_violations, f, indent=2)

    print(f"\nFound {len(all_violations)} violation(s):")
    for v in all_violations:
        # Codes come straight from the JSON and may be numbers, so convert
        # them before joining.
        codes_text = ", ".join(str(code) for code in v["codes"])
        print(f" {v['file']}")
        print(f" Parent: {v['parent_title']} (code: {v['parent_code']})")
        print(f' Duplicate title: "{v["duplicate_title"]}"')
        print(f" Document codes: {codes_text}")
    sys.exit(1)


# Run the check only when executed as a script, not when imported.
if __name__ == "__main__":
    main()