diff --git a/.gitea/workflows/helpers/class-comment.py b/.gitea/workflows/helpers/class-comment.py index a4bfbb4de..17682d468 100755 --- a/.gitea/workflows/helpers/class-comment.py +++ b/.gitea/workflows/helpers/class-comment.py @@ -1,5 +1,18 @@ #!/usr/bin/env python3 -"""Generate PR comment from CLASS.TXT.json violations.""" +""" +Generate PR comment for CLASS.TXT.json duplicate title violations. + +This script reads violations.json (created by class-txt-check.py) and generates +a formatted markdown comment to be posted on the PR. The comment includes: + - File path where violations were found + - Parent document title and code + - The duplicate title + - Document codes that share the duplicate title + +Usage: + Run after class-txt-check.py fails. Reads violations.json and outputs JSON + with 'body' field containing the markdown comment text. +""" import json import sys diff --git a/.gitea/workflows/helpers/class-txt-check.py b/.gitea/workflows/helpers/class-txt-check.py index 592c0432b..aae424231 100755 --- a/.gitea/workflows/helpers/class-txt-check.py +++ b/.gitea/workflows/helpers/class-txt-check.py @@ -1,5 +1,26 @@ #!/usr/bin/env python3 -"""Check CLASS.TXT.json files for duplicate titles under the same parent.""" +""" +Check CLASS.TXT.json files for duplicate titles under the same parent document. + +This script validates that child documents under the same parent (p_code) have +unique titles. Comparison is case-insensitive, so "Creating an ECS" and +"creating an ecs" are considered duplicates. + +JSON structure: + - Each entry has: code (document ID), p_code (parent document ID), title + - Documents with same p_code are siblings under the same parent + - Siblings must have unique titles (case-insensitive) + +Example violation: + Parent (code="3", title="Virtual Private Cloud") + - Child (code="4", p_code="3", title="Creating a VPC") + - Child (code="5", p_code="3", title="creating a vpc") <- DUPLICATE! + +Usage: + Set CHANGED_FILES environment variable with space-separated list of CLASS.TXT.json files. + Exits with code 1 if violations found, 0 otherwise. + Writes violations to violations.json for comment generation. +""" import sys import os diff --git a/.gitea/workflows/helpers/metadata-check.py b/.gitea/workflows/helpers/metadata-check.py index 45fd493b8..bdae103c2 100755 --- a/.gitea/workflows/helpers/metadata-check.py +++ b/.gitea/workflows/helpers/metadata-check.py @@ -1,5 +1,30 @@ #!/usr/bin/env python3 -"""Check that newly added HTML files are registered in metadata files.""" +""" +Check that newly added HTML files are registered in both metadata files. + +This script validates that every new HTML file added in a PR is properly +registered in both CLASS.TXT.json and ALL_META.TXT.json in the same directory. + +Directory structure: + docs/vpc/api-ref/ + ├── CLASS.TXT.json # Contains uri references to HTML files + ├── ALL_META.TXT.json # Contains uri references to HTML files + └── en-us_topic_XXXXX.html + +Both metadata files use the "uri" field to reference HTML filenames. When a +new HTML file is added, it must be added to both metadata files. + +Checks performed: + 1. Does CLASS.TXT.json exist in the HTML file's directory? + 2. Does ALL_META.TXT.json exist in the HTML file's directory? + 3. Is the HTML filename listed in CLASS.TXT.json (in uri field)? + 4. Is the HTML filename listed in ALL_META.TXT.json (in uri field)? + +Usage: + Set ADDED_FILES environment variable with space-separated list of added HTML files. + Exits with code 1 if violations found, 0 otherwise. + Writes violations to violations.json for comment generation. +""" import sys import os diff --git a/.gitea/workflows/helpers/metadata-comment.py b/.gitea/workflows/helpers/metadata-comment.py index 96cf7f1c1..0a9778ea4 100755 --- a/.gitea/workflows/helpers/metadata-comment.py +++ b/.gitea/workflows/helpers/metadata-comment.py @@ -1,5 +1,17 @@ #!/usr/bin/env python3 -"""Generate PR comment from metadata check violations.""" +""" +Generate PR comment for metadata registration check violations. + +This script reads violations.json (created by metadata-check.py) and generates +a formatted markdown comment to be posted on the PR. The comment includes: + - HTML file path that was added but not registered + - Which metadata files it's missing from (CLASS.TXT.json, ALL_META.TXT.json) + - Whether the metadata files don't exist or the file is simply missing from them + +Usage: + Run after metadata-check.py fails. Reads violations.json and outputs JSON + with 'body' field containing the markdown comment text. +""" import json import sys diff --git a/.gitea/workflows/helpers/underscore-check.py b/.gitea/workflows/helpers/underscore-check.py index 74faf1395..37053edaa 100755 --- a/.gitea/workflows/helpers/underscore-check.py +++ b/.gitea/workflows/helpers/underscore-check.py @@ -1,5 +1,21 @@ #!/usr/bin/env python3 -"""Underscore check script for HTML files.""" +""" +Underscore check script for HTML files. + +This script checks changed HTML files for words ending with underscore (_) that are +not followed by alphanumeric characters. It uses BeautifulSoup to parse HTML and +extract only visible text content (ignoring class names, IDs, script/style content). + +Example violations: + - "test_" in text "Query the list whose names contain test_." (FLAGGED) + - "test_123" in text "Query the list called test_123" (NOT flagged - underscore followed by digits) + +Usage: + Set CHANGED_FILES environment variable with space-separated list of HTML files to check. + Exits with code 1 if violations found, 0 otherwise. + Writes violations to violations.json for comment generation. +""" + import sys import os import re @@ -11,9 +27,9 @@ from bs4 import BeautifulSoup def is_binary_file(filepath): """Check if file is binary by reading first bytes.""" try: - with open(filepath, 'rb') as f: + with open(filepath, "rb") as f: chunk = f.read(1024) - if b'\x00' in chunk: + if b"\x00" in chunk: return True return False except: @@ -22,12 +38,12 @@ def is_binary_file(filepath): def find_line_number(content, text_segment, start_line=0): """Find the line number where text_segment appears in content.""" - lines = content.split('\n') + lines = content.split("\n") search_text = text_segment.strip() - + # Escape special regex characters in search text escaped_search = re.escape(search_text) - + for i, line in enumerate(lines[start_line:], start=start_line + 1): if re.search(escaped_search, line): return i @@ -36,36 +52,36 @@ def find_line_number(content, text_segment, start_line=0): def extract_text_with_positions(html_content): """Extract visible text from HTML.""" - soup = BeautifulSoup(html_content, 'lxml') - + soup = BeautifulSoup(html_content, "lxml") + # Remove script and style elements - for element in soup(['script', 'style', 'pre', 'code']): + for element in soup(["script", "style", "pre", "code"]): element.decompose() - + results = [] - + def process_element(element, path=""): """Recursively process elements and extract text.""" if element.name is None: # It's a NavigableString text = element.get_text().strip() if text: # Clean up whitespace - text = ' '.join(text.split()) + text = " ".join(text.split()) if text: results.append(text) return - + # Get text directly from this element (not children) for child in element.children: if child.name is None: # Text node text = child.get_text().strip() if text: - text = ' '.join(text.split()) + text = " ".join(text.split()) if text: results.append(text) else: process_element(child, f"{path}>{element.name}") - + process_element(soup) return results @@ -73,80 +89,80 @@ def extract_text_with_positions(html_content): def check_underscore_violations(text): """Check for words ending with underscore (not followed by alphanumeric).""" # Pattern: word boundary + alphanumeric chars + underscore + not followed by alphanumeric - pattern = r'\b([A-Za-z0-9]+)_\b(?![A-Za-z0-9])' + pattern = r"\b([A-Za-z0-9]+)_\b(?![A-Za-z0-9])" matches = re.findall(pattern, text) return matches def get_context_line(content, line_num, chars=50): """Get context around the violation.""" - - lines = content.split('\n') + + lines = content.split("\n") if line_num < 1 or line_num > len(lines): return "" - + line = lines[line_num - 1] - - pattern = r'\b([A-Za-z0-9]+)_\b(?![A-Za-z0-9])' + + pattern = r"\b([A-Za-z0-9]+)_\b(?![A-Za-z0-9])" match = re.search(pattern, line) - + if match: # Calculate start and end positions for context window start = max(0, match.start() - chars) end = min(len(line), match.end() + chars) - + # Extract the context context = line[start:end] - + if start > 0: context = "..." + context - + if end < len(line): context = context + "..." - + return context - + # If no match found, return a shortened version of the line max_length = chars * 2 if len(line) > max_length: return line[:max_length] + "..." - + return line def main(): # Get changed files from environment - changed_files_str = os.environ.get('CHANGED_FILES', '') + changed_files_str = os.environ.get("CHANGED_FILES", "") changed_files = [f.strip() for f in changed_files_str.split() if f.strip()] - + if not changed_files: print("No HTML files changed in this PR") sys.exit(0) - + print(f"Checking {len(changed_files)} HTML file(s): {changed_files}") - + all_violations = [] - + for filepath in changed_files: if not os.path.exists(filepath): print(f"Warning: File not found: {filepath}") continue - + # Skip binary files if is_binary_file(filepath): print(f"Skipping binary file: {filepath}") continue - + try: - with open(filepath, 'r', encoding='utf-8', errors='ignore') as f: + with open(filepath, "r", encoding="utf-8", errors="ignore") as f: content = f.read() except Exception as e: print(f"Error reading {filepath}: {e}") continue - + # Extract visible text from HTML text_segments = extract_text_with_positions(content) - + # Check each text segment for violations for segment in text_segments: violations = check_underscore_violations(segment) @@ -156,39 +172,43 @@ def main(): if line_num: context = get_context_line(content, line_num) for word in violations: - all_violations.append({ - 'file': filepath, - 'line': line_num, - 'word': word + '_', - 'context': context - }) - + all_violations.append( + { + "file": filepath, + "line": line_num, + "word": word + "_", + "context": context, + } + ) + # Output results if all_violations: # Remove duplicates unique_violations = [] seen = set() for v in all_violations: - key = (v['file'], v['line'], v['word']) + key = (v["file"], v["line"], v["word"]) if key not in seen: seen.add(key) unique_violations.append(v) - + all_violations = unique_violations - + # Write violations to JSON file for later use - with open('violations.json', 'w') as f: + with open("violations.json", "w") as f: json.dump(all_violations, f, indent=2) - + print(f"\nFound {len(all_violations)} violation(s):") for v in all_violations: - print(f" {v['file']}:{v['line']} - '{v['word']}' in context: {v['context']}") - + print( + f" {v['file']}:{v['line']} - '{v['word']}' in context: {v['context']}" + ) + sys.exit(1) else: print("\nNo violations found") sys.exit(0) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/.gitea/workflows/helpers/underscore-comment.py b/.gitea/workflows/helpers/underscore-comment.py index 36c30499a..74d2f8785 100755 --- a/.gitea/workflows/helpers/underscore-comment.py +++ b/.gitea/workflows/helpers/underscore-comment.py @@ -1,24 +1,38 @@ #!/usr/bin/env python3 -"""Generate PR comment from violations.""" +""" +Generate PR comment for underscore check violations. + +This script reads violations.json (created by underscore-check.py) and generates +a formatted markdown comment to be posted on the PR. The comment includes: + - File path where violations were found + - Line number of each violation + - The offending word (ending with underscore) + - Context showing where the violation appears in the HTML + +Usage: + Run after underscore-check.py fails. Reads violations.json and outputs JSON + with 'body' field containing the markdown comment text. +""" + import json import sys def main(): try: - with open('violations.json', 'r') as f: + with open("violations.json", "r") as f: violations = json.load(f) except Exception: violations = [] if not violations: - print(json.dumps({'body': 'No violations to report'})) + print(json.dumps({"body": "No violations to report"})) sys.exit(0) # Group violations by file by_file = {} for v in violations: - key = v['file'] + key = v["file"] if key not in by_file: by_file[key] = [] by_file[key].append(v) @@ -28,25 +42,27 @@ def main(): "❌ **Underscore check failed**", "", "Found words ending with underscore (not followed by alphanumeric characters):", - "" + "", ] for filepath, file_violations in by_file.items(): lines.append(f"**{filepath}:**") for v in file_violations: - word = v['word'] - line_num = v['line'] - context = v['context'] + word = v["word"] + line_num = v["line"] + context = v["context"] # Escape markdown special chars in context - context = context.replace('`', '\\`') + context = context.replace("`", "\\`") lines.append(f" - Line {line_num}: `{word}` in context: `{context}`") lines.append("") - lines.append("**Please fix these issues as soon as possible.** Words should not end with an underscore unless followed by alphanumeric characters (A-Za-z0-9).") + lines.append( + "**Please fix these issues as soon as possible.** Words should not end with an underscore unless followed by alphanumeric characters (A-Za-z0-9)." + ) message = "\n".join(lines) - print(json.dumps({'body': message})) + print(json.dumps({"body": message})) -if __name__ == '__main__': +if __name__ == "__main__": main()