diff --git a/.gitea/workflows/docs-precheck.yml b/.gitea/workflows/docs-precheck.yml
new file mode 100644
index 000000000..665812243
--- /dev/null
+++ b/.gitea/workflows/docs-precheck.yml
@@ -0,0 +1,76 @@
+# .gitea/workflows/docs-precheck.yml
+name: Docs Precheck - Underscore Check
+
+on:
+  pull_request:
+    types: [opened, reopened, synchronize, edited]
+
+permissions:
+  contents: read
+  pull-requests: write
+
+jobs:
+  docs-precheck:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: pip install beautifulsoup4 lxml
+
+      - name: Get changed HTML files
+        id: changed-files
+        env:
+          BASE_SHA: ${{ gitea.event.pull_request.base.sha }}
+        run: |
+          git fetch origin "$BASE_SHA"
+          # --diff-filter=d skips deleted files, which cannot be checked.
+          # "|| true": grep exits 1 when nothing matches; that is not an error.
+          changed=$(git diff --name-only --diff-filter=d "${BASE_SHA}...HEAD" \
+            | { grep -E '\.(html|htm)$' || true; } | tr '\n' ' ')
+          echo "files=$changed" >> "$GITHUB_OUTPUT"
+          echo "CHANGED_FILES=$changed" >> "$GITHUB_ENV"
+          echo "Changed HTML files: $changed"
+
+      - name: Run underscore check
+        id: underscore-check
+        run: |
+          python3 .gitea/workflows/underscore-check.py
+
+      - name: Comment on PR with violations
+        if: failure() && steps.underscore-check.outcome == 'failure'
+        env:
+          GITEA_URL: ${{ gitea.server_url }}
+          REPO: ${{ gitea.repository }}
+          PR_NUMBER: ${{ gitea.event.pull_request.number }}
+          TOKEN: ${{ gitea.token }}
+        run: |
+          set -euo pipefail
+
+          # generate-comment.py prints the request payload: {"body": "..."}
+          MSG=$(python3 .gitea/workflows/generate-comment.py)
+          echo "$MSG"
+
+          # Comment on PR
+          curl -sS --fail-with-body -X POST \
+            -H "Authorization: token ${TOKEN}" \
+            -H "Content-Type: application/json" \
+            "${GITEA_URL}/api/v1/repos/${REPO}/issues/${PR_NUMBER}/comments" \
+            -d "$MSG"
+
+      - name: Final status
+        if: always()
+        run: |
+          if [ -f violations.json ]; then
+            echo "::error::Underscore check failed. See previous step for details."
+            exit 1
+          fi
diff --git a/.gitea/workflows/generate-comment.py b/.gitea/workflows/generate-comment.py
new file mode 100755
index 000000000..36c30499a
--- /dev/null
+++ b/.gitea/workflows/generate-comment.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""Generate the PR comment payload from violations.json.
+
+Prints a JSON object of the form {"body": "..."} to stdout, ready to be
+posted to the Gitea issue-comment API.
+"""
+import json
+import re
+import sys
+from collections import defaultdict
+
+VIOLATIONS_FILE = 'violations.json'
+
+
+def load_violations(path=VIOLATIONS_FILE):
+    """Return the list of violations stored at *path*.
+
+    A missing, unreadable or malformed file yields an empty list so that
+    generating the comment never fails because of it.
+    """
+    try:
+        with open(path, 'r', encoding='utf-8') as f:
+            violations = json.load(f)
+    except (OSError, ValueError):
+        return []
+    return violations if isinstance(violations, list) else []
+
+
+def code_span(text):
+    """Wrap *text* in a Markdown code span.
+
+    Backslash escapes are not interpreted inside code spans, so a backtick
+    in *text* cannot be escaped. The delimiter is instead made one backtick
+    longer than the longest backtick run in *text*.
+    """
+    text = ' '.join(str(text).split())
+    longest = max((len(run) for run in re.findall(r'`+', text)), default=0)
+    if not longest:
+        return f'`{text}`'
+    fence = '`' * (longest + 1)
+    # Padding keeps the delimiter apart from backticks at either end
+    return f'{fence} {text} {fence}'
+
+
+def build_message(violations):
+    """Return the Markdown comment body describing *violations*."""
+    # Group violations by file, keeping first-seen order
+    by_file = defaultdict(list)
+    for v in violations:
+        by_file[v['file']].append(v)
+
+    lines = [
+        "❌ **Underscore check failed**",
+        "",
+        "Found words ending with underscore (not followed by alphanumeric characters):",
+        "",
+    ]
+    for filepath, file_violations in by_file.items():
+        lines.append(f"**{filepath}:**")
+        for v in file_violations:
+            entry = f"  - Line {v.get('line') or '?'}: {code_span(v['word'])}"
+            context = v.get('context')
+            if context:
+                entry += f" in context: {code_span(context)}"
+            lines.append(entry)
+        lines.append("")
+
+    lines.append(
+        "**Please fix these issues as soon as possible.** Words should not "
+        "end with an underscore unless followed by alphanumeric characters "
+        "(A-Za-z0-9)."
+    )
+    return "\n".join(lines)
+
+
+def main():
+    """Print the comment payload for the current violations file."""
+    violations = load_violations()
+    if violations:
+        message = build_message(violations)
+    else:
+        message = 'No violations to report'
+    print(json.dumps({'body': message}))
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/.gitea/workflows/underscore-check.py b/.gitea/workflows/underscore-check.py
new file mode 100755
index 000000000..772dfb09c
--- /dev/null
+++ b/.gitea/workflows/underscore-check.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+"""Underscore check script for HTML files.
+
+Reads the space-separated list of changed HTML files from the CHANGED_FILES
+environment variable and looks for words ending in an underscore in their
+visible text. Findings are written to violations.json and make the script
+exit with status 1.
+"""
+import sys
+import os
+import re
+import json
+from pathlib import Path
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString, PreformattedString
+
+VIOLATIONS_FILE = 'violations.json'
+
+# word boundary + alphanumeric chars + underscore + end of word
+VIOLATION_RE = re.compile(r'\b([A-Za-z0-9]+)_\b(?![A-Za-z0-9])')
+
+# Elements whose text is code rather than prose
+SKIPPED_TAGS = ['script', 'style', 'pre', 'code']
+
+
+def is_binary_file(filepath):
+    """Return True if the file looks binary (NUL byte in the first 1 KiB).
+
+    Files that cannot be read are reported as binary so callers skip them.
+    """
+    try:
+        with open(filepath, 'rb') as f:
+            return b'\x00' in f.read(1024)
+    except OSError:
+        return True
+
+
+def find_line_number(content, text_segment, start_line=0):
+    """Return the 1-based number of the first line containing text_segment.
+
+    Runs of whitespace are collapsed on both sides before comparing, because
+    extracted text segments are whitespace-normalized. Lines before index
+    start_line (0-based) are not searched. Returns None if the text is empty
+    or cannot be found.
+    """
+    search_text = ' '.join(text_segment.split())
+    if not search_text:
+        return None
+    lines = content.split('\n')
+    for i, line in enumerate(lines[start_line:], start=start_line + 1):
+        if search_text in ' '.join(line.split()):
+            return i
+    return None
+
+
+def extract_text_with_positions(html_content):
+    """Return the visible text nodes of html_content in document order.
+
+    Text inside the SKIPPED_TAGS elements is left out, and so are comments,
+    doctypes and other strings that are never rendered. Whitespace in each
+    segment is collapsed to single spaces; empty segments are dropped.
+    """
+    soup = BeautifulSoup(html_content, 'lxml')
+
+    for element in soup(SKIPPED_TAGS):
+        element.decompose()
+
+    results = []
+    for node in soup.descendants:
+        if not isinstance(node, NavigableString):
+            continue
+        # Comment, Doctype, CData, ... are not part of the rendered page
+        if isinstance(node, PreformattedString):
+            continue
+        text = ' '.join(str(node).split())
+        if text:
+            results.append(text)
+    return results
+
+
+def check_underscore_violations(text):
+    """Return the words in text that end with an underscore.
+
+    The trailing underscore is not included in the returned words.
+    """
+    return VIOLATION_RE.findall(text)
+
+
+def get_context_line(content, line_num, chars=50, word=None):
+    """Return an excerpt of line line_num (1-based) around a violation.
+
+    Up to chars characters are kept on each side of the violation and "..."
+    marks every side that was cut off. When word (trailing underscore
+    included) is given and occurs on the line, the excerpt is centred on it;
+    otherwise on the first violation of the line. A line without violations
+    is cut to 2 * chars characters. Returns "" if line_num is out of range.
+    """
+    lines = content.split('\n')
+    if line_num is None or line_num < 1 or line_num > len(lines):
+        return ""
+    line = lines[line_num - 1]
+
+    matches = list(VIOLATION_RE.finditer(line))
+    match = next((m for m in matches if m.group(0) == word), None)
+    if match is None and matches:
+        match = matches[0]
+    if match is None:
+        return line[:chars * 2] + "..." if len(line) > chars * 2 else line
+
+    start = max(0, match.start() - chars)
+    end = min(len(line), match.end() + chars)
+    context = line[start:end]
+    if start > 0:
+        context = "..." + context
+    if end < len(line):
+        context = context + "..."
+    return context
+
+
+def locate_violation(content, segment, word):
+    """Return the 1-based line of a violation, or None if it is not found.
+
+    The enclosing text segment is looked up first. That can fail because a
+    segment may span several source lines or contain decoded entities, so
+    the first line on which word itself is a violation is used as fallback.
+    """
+    line_num = find_line_number(content, segment)
+    if line_num is not None:
+        return line_num
+    for i, line in enumerate(content.split('\n'), start=1):
+        if any(m.group(0) == word for m in VIOLATION_RE.finditer(line)):
+            return i
+    return None
+
+
+def check_file(filepath):
+    """Return the violations found in the HTML file at filepath."""
+    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+        content = f.read()
+
+    violations = []
+    for segment in extract_text_with_positions(content):
+        for match in check_underscore_violations(segment):
+            word = match + '_'
+            line_num = locate_violation(content, segment, word)
+            if line_num is None:
+                # Report it anyway: dropping the violation would let the
+                # check pass. The segment is a single line of its own.
+                context = get_context_line(segment, 1, word=word)
+            else:
+                context = get_context_line(content, line_num, word=word)
+            violations.append({
+                'file': filepath,
+                'line': line_num,
+                'word': word,
+                'context': context,
+            })
+    return violations
+
+
+def deduplicate(violations):
+    """Return violations without repeated (file, line, word) entries."""
+    unique = {}
+    for v in violations:
+        unique.setdefault((v['file'], v['line'], v['word']), v)
+    return list(unique.values())
+
+
+def main():
+    """Check every changed HTML file and report the violations found."""
+    # A report left over from an earlier run must not fail later steps
+    Path(VIOLATIONS_FILE).unlink(missing_ok=True)
+
+    changed_files = os.environ.get('CHANGED_FILES', '').split()
+    if not changed_files:
+        print("No HTML files changed in this PR")
+        sys.exit(0)
+
+    print(f"Checking {len(changed_files)} HTML file(s): {changed_files}")
+
+    all_violations = []
+    for filepath in changed_files:
+        if not os.path.exists(filepath):
+            print(f"Warning: File not found: {filepath}")
+            continue
+        if is_binary_file(filepath):
+            print(f"Skipping binary file: {filepath}")
+            continue
+        try:
+            all_violations.extend(check_file(filepath))
+        except OSError as e:
+            print(f"Error reading {filepath}: {e}")
+
+    all_violations = deduplicate(all_violations)
+    if not all_violations:
+        print("\n✅ No violations found")
+        sys.exit(0)
+
+    # Write violations to JSON file for the comment step
+    with open(VIOLATIONS_FILE, 'w', encoding='utf-8') as f:
+        json.dump(all_violations, f, indent=2)
+
+    print(f"\n❌ Found {len(all_violations)} violation(s):")
+    for v in all_violations:
+        print(f"  {v['file']}:{v['line'] or '?'} - '{v['word']}' in context: {v['context']}")
+
+    sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()