Files
doc-exports/.gitea/workflows/helpers/underscore-check.py

195 lines
5.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""Underscore check script for HTML files."""
import sys
import os
import re
import json
from pathlib import Path
from bs4 import BeautifulSoup
def is_binary_file(filepath):
    """Return True if the file looks binary or cannot be read.

    Only the first 1024 bytes are inspected; a NUL byte in that chunk marks
    the file as binary.

    Args:
        filepath: Path of the file to probe.

    Returns:
        True when a NUL byte is found in the first 1024 bytes, or when the
        file cannot be opened/read; False otherwise.
    """
    try:
        with open(filepath, 'rb') as f:
            return b'\x00' in f.read(1024)
    except (OSError, ValueError):
        # Unreadable files (missing, permission denied, malformed path) are
        # reported as binary so the caller skips them. Unlike a bare
        # ``except:``, this does not swallow KeyboardInterrupt/SystemExit.
        return True
def find_line_number(content, text_segment, start_line=0):
    """Return the 1-based number of the first line containing text_segment.

    The segment is stripped and then looked up as a literal substring. Each
    line is tried both verbatim and with its whitespace collapsed to single
    spaces, so a segment that was whitespace-normalised after extraction
    still matches a source line that uses tabs or runs of spaces.

    Args:
        content: Full text to search; it is split on '\n'.
        text_segment: Text to locate.
        start_line: 0-based index of the first line to examine.

    Returns:
        The 1-based line number of the first matching line, or None when the
        segment is empty/whitespace-only or no line contains it.
    """
    search_text = text_segment.strip()
    if not search_text:
        # An empty needle is a substring of every line; reporting the first
        # line for it would be meaningless.
        return None
    collapsed_search = ' '.join(search_text.split())
    lines = content.split('\n')
    for line_num, line in enumerate(lines[start_line:], start=start_line + 1):
        # Plain substring tests replace the former re.escape + re.search,
        # which was an indirect way of doing exactly this.
        if search_text in line or collapsed_search in ' '.join(line.split()):
            return line_num
    return None
def extract_text_with_positions(html_content):
    """Extract text segments from HTML, in document order.

    Despite the name, no position information is returned: the result is a
    plain list of strings, one per non-empty text node.

    Args:
        html_content: HTML markup to parse (parsed with the 'lxml' parser).

    Returns:
        A list of text segments. Each segment has its whitespace collapsed
        to single spaces; nodes that are empty after collapsing are omitted.
    """
    soup = BeautifulSoup(html_content, 'lxml')

    # Drop elements whose content should not be checked: scripts, styles and
    # literal/code blocks (pre, code).
    for element in soup(['script', 'style', 'pre', 'code']):
        element.decompose()

    results = []
    # .descendants walks the tree depth-first in document order, which is the
    # same order the former recursive helper produced.
    for node in soup.descendants:
        if node.name is not None:
            # A tag: its text nodes are visited on their own.
            continue
        # split()/join both trims and collapses internal whitespace.
        text = ' '.join(node.get_text().split())
        if text:
            results.append(text)
    return results
def check_underscore_violations(text):
    """Find alphanumeric words that are immediately followed by a trailing underscore.

    A hit is a run of ASCII letters/digits that starts at a word boundary and
    is followed by a single '_' which itself ends the word. Because '_' is a
    word character, names with inner underscores (e.g. 'foo_bar_') or a
    leading underscore do not produce a hit.

    Returns:
        A list with the alphanumeric part of every hit (the underscore is
        not included), in order of appearance.
    """
    trailing_underscore = re.compile(r'\b([A-Za-z0-9]+)_\b(?![A-Za-z0-9])')
    return [hit.group(1) for hit in trailing_underscore.finditer(text)]
def get_context_line(content, line_num, chars=50):
    """Return a short excerpt of a line, centred on its first violation.

    Args:
        content: Full text; it is split on '\n' to address lines.
        line_num: 1-based number of the line to excerpt.
        chars: Number of characters to keep on each side of the match.

    Returns:
        "" when line_num is out of range. Otherwise the text around the first
        trailing-underscore word on that line, with "..." marking each side
        that was cut. If the line contains no such word, the line itself is
        returned, truncated to 2 * chars characters plus "..." when longer.
    """
    all_lines = content.split('\n')
    if line_num < 1 or len(all_lines) < line_num:
        return ""

    target = all_lines[line_num - 1]
    hit = re.search(r'\b([A-Za-z0-9]+)_\b(?![A-Za-z0-9])', target)

    if hit is None:
        # Nothing to centre on: fall back to the head of the line.
        limit = chars * 2
        if len(target) > limit:
            return target[:limit] + "..."
        return target

    # Clamp the window to the bounds of the line.
    left = max(0, hit.start() - chars)
    right = min(len(target), hit.end() + chars)
    prefix = "..." if left > 0 else ""
    suffix = "..." if right < len(target) else ""
    return prefix + target[left:right] + suffix
def main():
    """Check the files named in CHANGED_FILES for words ending in '_'.

    CHANGED_FILES is read from the environment as a whitespace-separated
    list of paths. Missing, binary and unreadable files are skipped with a
    message. Every text segment extracted from a file is checked; each
    distinct (file, line, word) violation is recorded once.

    Side effects:
        Prints progress and results to stdout. When violations exist they
        are also written to 'violations.json' in the working directory.

    Exits:
        0 when there is nothing to check or no violation was found,
        1 when at least one violation was found.
    """
    # str.split() without arguments already trims and drops empty entries.
    changed_files = os.environ.get('CHANGED_FILES', '').split()
    if not changed_files:
        print("No HTML files changed in this PR")
        sys.exit(0)
    print(f"Checking {len(changed_files)} HTML file(s): {changed_files}")

    all_violations = []
    seen = set()  # (file, line, word) keys already recorded
    for filepath in changed_files:
        if not os.path.exists(filepath):
            print(f"Warning: File not found: {filepath}")
            continue
        if is_binary_file(filepath):
            print(f"Skipping binary file: {filepath}")
            continue
        try:
            # errors='ignore' drops undecodable bytes instead of raising.
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except OSError as e:
            print(f"Error reading {filepath}: {e}")
            continue

        for segment in extract_text_with_positions(content):
            violations = check_underscore_violations(segment)
            if not violations:
                continue
            line_num = find_line_number(content, segment)
            if line_num:
                context = get_context_line(content, line_num)
            else:
                # The segment could not be found verbatim in the raw source
                # (e.g. it spans several lines or contains entities). Report
                # it anyway, with line 0 meaning "unknown", instead of
                # silently letting the check pass.
                line_num = 0
                context = get_context_line(segment, 1)
            for word in violations:
                key = (filepath, line_num, word + '_')
                if key in seen:
                    continue
                seen.add(key)
                all_violations.append({
                    'file': filepath,
                    'line': line_num,
                    'word': word + '_',
                    'context': context
                })

    if not all_violations:
        print("\nNo violations found")
        sys.exit(0)

    # Persist the violations for later use.
    with open('violations.json', 'w', encoding='utf-8') as f:
        json.dump(all_violations, f, indent=2)
    print(f"\nFound {len(all_violations)} violation(s):")
    for v in all_violations:
        print(f" {v['file']}:{v['line']} - '{v['word']}' in context: {v['context']}")
    sys.exit(1)
# Run the check only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()