doc-exports/.gitea/workflows/helpers/underscore-check.py

#!/usr/bin/env python3
"""Underscore check script for HTML files."""
import sys
import os
import re
import json
from pathlib import Path
from bs4 import BeautifulSoup


def is_binary_file(filepath):
    """Report whether a file looks binary.

    Only the first 1024 bytes are inspected; the file is considered binary
    when they contain a NUL byte.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        True when a NUL byte is found in the first 1024 bytes, or when the
        file cannot be opened or read; False otherwise.
    """
    try:
        with open(filepath, 'rb') as f:
            return b'\x00' in f.read(1024)
    except (OSError, ValueError):
        # Unreadable or invalid paths are reported as binary.  Only I/O and
        # invalid-path errors are caught, so KeyboardInterrupt and SystemExit
        # still propagate instead of being swallowed by a bare ``except``.
        return True


def find_line_number(content, text_segment, start_line=0):
    """Find the 1-based line number where ``text_segment`` appears.

    The segment is stripped and searched for as a literal substring, line by
    line.  If no line contains it verbatim, a second pass compares the
    segment and each line with runs of whitespace collapsed to one space, so
    text whose whitespace was normalised can still be located.

    Args:
        content: Full text to search; lines are separated by ``'\\n'``.
        text_segment: Text to look for.  Surrounding whitespace is ignored.
        start_line: Number of leading lines to skip (0 searches everything).

    Returns:
        The 1-based number of the first matching line, or None when the
        segment is blank or no line matches.
    """
    needle = text_segment.strip()
    if not needle:
        # A blank needle would trivially "match" the first line searched.
        return None

    lines = content.split('\n')[start_line:]
    first_number = start_line + 1

    # Pass 1: verbatim substring match (plain ``in`` instead of an escaped
    # regular expression, which did the same thing).
    for number, line in enumerate(lines, start=first_number):
        if needle in line:
            return number

    # Pass 2: whitespace-insensitive match, used only when pass 1 found
    # nothing so that exact matches always take precedence.
    collapsed_needle = ' '.join(needle.split())
    for number, line in enumerate(lines, start=first_number):
        if collapsed_needle in ' '.join(line.split()):
            return number

    return None


def extract_text_with_positions(html_content):
    """Collect the text nodes of an HTML document in document order.

    The markup is parsed with BeautifulSoup's ``lxml`` parser.  All
    ``script``, ``style``, ``pre`` and ``code`` elements are removed together
    with their contents before any text is collected.

    Args:
        html_content: HTML markup to parse.

    Returns:
        A list of non-empty strings, one per remaining text node, each with
        runs of whitespace collapsed to single spaces.  Despite the
        function's name, no position information is returned.
    """
    soup = BeautifulSoup(html_content, 'lxml')

    # Drop scripts, styles and code samples so their text is never collected
    # (presumably because underscores are legitimate there).
    for element in soup(['script', 'style', 'pre', 'code']):
        element.decompose()

    results = []

    # ``descendants`` walks the tree depth-first in document order; nodes
    # whose ``name`` is None are text nodes rather than tags.
    for node in soup.descendants:
        if node.name is not None:
            continue
        text = ' '.join(node.get_text().split())
        if text:
            results.append(text)

    return results


def check_underscore_violations(text):
    """Return the words in ``text`` that carry a trailing underscore.

    A hit is a run of ASCII letters/digits that starts at a word boundary
    and is followed by a single ``_`` which itself ends the word.

    Args:
        text: Text to scan.

    Returns:
        A list with the matched words, in order of appearance and without
        the trailing underscore.  Empty when nothing matches.
    """
    # NOTE(review): because ``_`` counts as a word character for ``\b``,
    # names with an inner or leading underscore (``foo_bar_``, ``_foo_``)
    # and doubled underscores (``foo__``) are not matched -- confirm that
    # this is intended.
    trailing_underscore = re.compile(r'\b([A-Za-z0-9]+)_\b(?![A-Za-z0-9])')
    return [hit.group(1) for hit in trailing_underscore.finditer(text)]


def get_context_line(content, line_num, chars=50):
    """Build a short excerpt of one line for display next to a violation.

    Args:
        content: Full text; lines are separated by ``'\\n'``.
        line_num: 1-based number of the line to excerpt.
        chars: Number of characters to keep on each side of the match.

    Returns:
        An empty string when ``line_num`` is out of range.  Otherwise, if the
        line contains a trailing-underscore word, the first such match with
        up to ``chars`` characters on either side, with ``...`` marking each
        side that was cut.  If the line has no match, the line itself,
        truncated to ``2 * chars`` characters plus ``...`` when longer.
    """
    all_lines = content.split('\n')
    if not 1 <= line_num <= len(all_lines):
        return ""

    text = all_lines[line_num - 1]
    hit = re.search(r'\b([A-Za-z0-9]+)_\b(?![A-Za-z0-9])', text)

    if hit is None:
        # Nothing to centre on: fall back to the head of the line.
        limit = chars * 2
        return text[:limit] + "..." if len(text) > limit else text

    # Window around the match, clamped to the line.
    left = max(0, hit.start() - chars)
    right = min(len(text), hit.end() + chars)

    prefix = "..." if left > 0 else ""
    suffix = "..." if right < len(text) else ""
    return prefix + text[left:right] + suffix


def main():
    """Check the changed files for words ending in an underscore.

    The whitespace-separated paths in the ``CHANGED_FILES`` environment
    variable are read one by one.  Missing, binary and unreadable files are
    skipped with a message.  For the rest, the text extracted from the HTML
    is scanned and every violation is recorded once per
    ``(file, line, word)``.

    A violation whose text cannot be mapped back to a source line is still
    reported, with line ``0`` and a context taken from the extracted text,
    rather than being dropped.

    Side effects:
        Prints progress and results to stdout.  When violations exist they
        are also written to ``violations.json`` in the working directory.

    Exits with status 1 when at least one violation was found, 0 otherwise.
    """
    # ``str.split()`` with no arguments already strips and drops empty items.
    changed_files = os.environ.get('CHANGED_FILES', '').split()

    if not changed_files:
        print("No HTML files changed in this PR")
        sys.exit(0)

    print(f"Checking {len(changed_files)} HTML file(s): {changed_files}")

    all_violations = []
    seen = set()  # (file, line, word) keys already recorded

    for filepath in changed_files:
        if not os.path.exists(filepath):
            print(f"Warning: File not found: {filepath}")
            continue

        # Skip binary files
        if is_binary_file(filepath):
            print(f"Skipping binary file: {filepath}")
            continue

        try:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except OSError as e:
            print(f"Error reading {filepath}: {e}")
            continue

        # Check each visible text segment for violations
        for segment in extract_text_with_positions(content):
            words = check_underscore_violations(segment)
            if not words:
                continue

            line_num = find_line_number(content, segment)
            if line_num is None:
                # The segment could not be located in the raw source (for
                # example because the markup differs from the extracted
                # text).  Report it anyway; 0 marks the line as unknown.
                # NOTE(review): confirm consumers of violations.json accept
                # line 0.
                line_num = 0
                context = get_context_line(segment, 1)
            else:
                context = get_context_line(content, line_num)

            for word in words:
                flagged = word + '_'
                key = (filepath, line_num, flagged)
                if key in seen:
                    continue
                seen.add(key)
                all_violations.append({
                    'file': filepath,
                    'line': line_num,
                    'word': flagged,
                    'context': context
                })

    if not all_violations:
        print("\nNo violations found")
        sys.exit(0)

    # Write violations to JSON file for later use
    with open('violations.json', 'w', encoding='utf-8') as f:
        json.dump(all_violations, f, indent=2)

    print(f"\nFound {len(all_violations)} violation(s):")
    for v in all_violations:
        print(f"   {v['file']}:{v['line']} - '{v['word']}' in context: {v['context']}")

    sys.exit(1)


# Run the check only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()