#!/usr/bin/env python3
"""
|
|
Check CLASS.TXT.json files for duplicate titles under the same parent document.
|
|
|
|
This script validates that child documents under the same parent (p_code) have
|
|
unique titles. Comparison is case-insensitive, so "Creating an ECS" and
|
|
"creating an ecs" are considered duplicates.
|
|
|
|
JSON structure:
|
|
- Each entry has: code (document ID), p_code (parent document ID), title
|
|
- Documents with same p_code are siblings under the same parent
|
|
- Siblings must have unique titles (case-insensitive)
|
|
|
|
Example violation:
|
|
Parent (code="3", title="Virtual Private Cloud")
|
|
- Child (code="4", p_code="3", title="Creating a VPC")
|
|
- Child (code="5", p_code="3", title="creating a vpc") <- DUPLICATE!
|
|
|
|
Usage:
|
|
Set CHANGED_FILES environment variable with space-separated list of CLASS.TXT.json files.
|
|
Exits with code 1 if violations found, 0 otherwise.
|
|
Writes violations to violations.json for comment generation.
|
|
"""
|
|
|
|
import json
import os
import sys
from collections import defaultdict


def check_duplicate_titles(json_path):
    """
    Check for duplicate titles under the same parent.

    Entries are grouped by their ``p_code``; within each group, titles are
    compared after case-folding and stripping surrounding whitespace. Entries
    that are not JSON objects, or that have an empty/missing ``code`` or
    ``title``, are ignored. A missing or null ``p_code`` is treated as the
    root group.

    If the file cannot be read, is not valid JSON, or its top-level value is
    not a list, an error is printed and an empty list is returned.

    Returns list of violations:
    [
        {
            'file': 'path/to/CLASS.TXT.json',
            'parent_code': '3',
            'parent_title': 'Virtual Private Cloud',
            'duplicate_title': 'Creating a VPC',
            'codes': ['4', '6']
        }
    ]
    """
    # Load the JSON file
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON in {json_path}: {e}")
        return []
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading {json_path}: {e}")
        return []

    # The rest of the function iterates over entries; any other top-level
    # shape (object, string, number) cannot be checked.
    if not isinstance(data, list):
        print(f"Error reading {json_path}: expected a JSON array of entries")
        return []

    # Single pass: build the code -> title lookup (used for parent titles)
    # and group the usable entries by parent code.
    code_to_title = {}
    parent_groups = defaultdict(list)
    for entry in data:
        if not isinstance(entry, dict):
            continue

        code = entry.get("code", "")
        title = entry.get("title", "")

        # Skip if missing required fields
        if not code or not title:
            continue

        # JSON values may be numbers; normalise to text so that lookups,
        # case-folding and later string joins cannot fail.
        code = str(code)
        title = str(title)
        raw_parent = entry.get("p_code")
        p_code = "" if raw_parent is None else str(raw_parent)

        code_to_title[code] = title
        parent_groups[p_code].append({"code": code, "title": title})

    violations = []

    # Check each parent group for duplicate titles
    for p_code, children in parent_groups.items():
        # Group children by normalised title. casefold() rather than lower()
        # so that caseless matching also works for characters such as "ß".
        title_groups = defaultdict(list)
        for child in children:
            title_groups[child["title"].casefold().strip()].append(child)

        if p_code in code_to_title:
            parent_title = code_to_title[p_code]
        elif p_code == "":
            parent_title = "(root)"
        else:
            parent_title = f"(unknown parent: {p_code})"

        for docs in title_groups.values():
            if len(docs) < 2:
                continue

            violations.append(
                {
                    "file": json_path,
                    "parent_code": p_code if p_code else "(root)",
                    "parent_title": parent_title,
                    # Report the title as written in the first occurrence
                    "duplicate_title": docs[0]["title"],
                    "codes": [doc["code"] for doc in docs],
                }
            )

    return violations


def main():
    """
    Run the duplicate-title check over the files named in CHANGED_FILES.

    Reads the whitespace-separated file list from the CHANGED_FILES
    environment variable, skips paths that do not exist (with a warning),
    and collects the violations reported by check_duplicate_titles() for
    the rest.

    Always terminates via sys.exit():
    - exit code 0 when no files were given or no violations were found;
    - exit code 1 when violations were found, after writing them to
      violations.json in the current directory and printing a summary.
    """
    # Get changed files from environment. str.split() with no argument
    # already discards empty and whitespace-only tokens.
    changed_files = os.environ.get("CHANGED_FILES", "").split()

    if not changed_files:
        print("No CLASS.TXT.json files changed in this PR")
        sys.exit(0)

    print(f"Checking {len(changed_files)} CLASS.TXT.json file(s): {changed_files}")

    all_violations = []
    for filepath in changed_files:
        if not os.path.exists(filepath):
            print(f"Warning: File not found: {filepath}")
            continue

        all_violations.extend(check_duplicate_titles(filepath))

    if not all_violations:
        print("\nNo violations found")
        sys.exit(0)

    # Write violations to JSON file for later use
    with open("violations.json", "w", encoding="utf-8") as f:
        json.dump(all_violations, f, indent=2)

    print(f"\nFound {len(all_violations)} violation(s):")
    for v in all_violations:
        # Codes come straight from JSON and may be numbers; join() needs str.
        codes = ", ".join(str(code) for code in v["codes"])
        print(f" {v['file']}")
        print(f" Parent: {v['parent_title']} (code: {v['parent_code']})")
        print(f' Duplicate title: "{v["duplicate_title"]}"')
        print(f" Document codes: {codes}")

    sys.exit(1)


# Run the check only when executed as a script, not when imported.
if __name__ == "__main__":
    main()