forked from docs/doc-exports
195 lines
5.9 KiB
Python
Executable File
195 lines
5.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Underscore check script for HTML files."""
|
|
import sys
|
|
import os
|
|
import re
|
|
import json
|
|
from pathlib import Path
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
def is_binary_file(filepath):
    """Report whether a file looks binary, judged by its first 1024 bytes.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        True if a NUL byte occurs within the first 1024 bytes, or if the
        file cannot be opened or read; False otherwise.
    """
    try:
        with open(filepath, 'rb') as f:
            chunk = f.read(1024)
    except (OSError, ValueError):
        # Unreadable paths (missing, permission denied, a directory, or a
        # path with an embedded NUL) are reported as binary so callers skip
        # them. Catching only these types, rather than using a bare
        # ``except``, lets KeyboardInterrupt and SystemExit propagate.
        return True
    return b'\x00' in chunk
|
|
|
|
|
|
def find_line_number(content, text_segment, start_line=0):
    """Locate the first line of ``content`` that contains ``text_segment``.

    The segment is stripped of surrounding whitespace and matched literally
    (no regex interpretation) against each line, skipping the first
    ``start_line`` lines of ``content``.

    Args:
        content: Full text, split into lines on ``'\\n'``.
        text_segment: Text to look for.
        start_line: Number of leading lines to skip before searching.

    Returns:
        The 1-based line number of the first line containing the segment,
        or None when no line contains it.
    """
    needle = text_segment.strip()
    candidates = content.split('\n')[start_line:]
    return next(
        (
            start_line + offset
            for offset, candidate in enumerate(candidates, start=1)
            if needle in candidate
        ),
        None,
    )
|
|
|
|
|
|
def extract_text_with_positions(html_content):
    """Collect the text nodes of an HTML document as a flat list of strings.

    The markup is parsed with BeautifulSoup using the ``lxml`` parser.
    ``script``, ``style``, ``pre`` and ``code`` elements are removed before
    the text is gathered, so their contents are never returned.

    Despite the function name, no position information is returned.

    Args:
        html_content: HTML markup to parse.

    Returns:
        A list of non-empty strings, one per text node, in tree-traversal
        order, each with runs of whitespace collapsed to single spaces.
    """
    document = BeautifulSoup(html_content, 'lxml')

    # Discard subtrees whose text must not be checked
    for unwanted in document(['script', 'style', 'pre', 'code']):
        unwanted.decompose()

    collected = []

    def record(node):
        """Store the node's whitespace-collapsed text when it is non-empty."""
        normalised = ' '.join(node.get_text().split())
        if normalised:
            collected.append(normalised)

    def walk(node):
        """Visit ``node`` depth-first, recording every text node found."""
        if node.name is None:  # a bare string node
            record(node)
            return

        for child in node.children:
            if child.name is None:  # text directly inside ``node``
                record(child)
            else:
                walk(child)

    walk(document)
    return collected
|
|
|
|
|
|
def check_underscore_violations(text):
    """Collect words in ``text`` that carry a single trailing underscore.

    A hit is a run of ASCII letters/digits that starts at a word boundary,
    is followed by one underscore, and is then followed by a word boundary
    (so the underscore is not followed by a letter, digit or underscore).

    Args:
        text: Text to scan.

    Returns:
        The matched words, without their trailing underscore, in order of
        appearance. Empty when nothing matches.
    """
    trailing_underscore = re.compile(r'\b([A-Za-z0-9]+)_\b(?![A-Za-z0-9])')
    return [hit.group(1) for hit in trailing_underscore.finditer(text)]
|
|
|
|
|
|
def get_context_line(content, line_num, chars=50):
    """Return a short excerpt of one line, centred on its first violation.

    Args:
        content: Full text, split into lines on ``'\\n'``.
        line_num: 1-based number of the line to excerpt.
        chars: Number of characters of context kept on each side of the
            match.

    Returns:
        An empty string when ``line_num`` is outside the text. Otherwise,
        if the line contains a trailing-underscore word, the match plus up
        to ``chars`` characters on either side, with ``"..."`` marking each
        side that was cut. If the line has no match, the line itself,
        truncated to ``chars * 2`` characters plus ``"..."`` when longer.
    """
    all_lines = content.split('\n')
    if not 1 <= line_num <= len(all_lines):
        return ""

    target = all_lines[line_num - 1]
    hit = re.search(r'\b([A-Za-z0-9]+)_\b(?![A-Za-z0-9])', target)

    if hit is None:
        # Nothing to centre on: fall back to the head of the line
        limit = chars * 2
        return target[:limit] + "..." if len(target) > limit else target

    # Clamp the context window to the bounds of the line
    lo = max(0, hit.start() - chars)
    hi = min(len(target), hit.end() + chars)

    prefix = "..." if lo > 0 else ""
    suffix = "..." if hi < len(target) else ""
    return prefix + target[lo:hi] + suffix
|
|
|
|
|
|
def _find_word_line(content, word):
    """Return the 1-based line of ``content`` holding ``word_`` as a whole token, or None."""
    token = re.compile(r'\b' + re.escape(word) + r'_\b')
    for number, raw_line in enumerate(content.split('\n'), start=1):
        if token.search(raw_line):
            return number
    return None


def main():
    """Check the changed HTML files for words with a trailing underscore.

    The whitespace-separated list of files is read from the
    ``CHANGED_FILES`` environment variable. Missing, binary and unreadable
    files are reported and skipped.

    When violations are found they are written to ``violations.json`` in
    the current directory, printed, and the process exits with status 1.
    Otherwise the process exits with status 0.

    Each violation is a dict with the keys ``file``, ``line``, ``word`` and
    ``context``. ``line`` is 0 when the position in the source could not be
    determined; ``context`` is then taken from the extracted text.
    """
    # str.split() with no argument already drops empty and blank entries
    changed_files = os.environ.get('CHANGED_FILES', '').split()

    if not changed_files:
        print("No HTML files changed in this PR")
        sys.exit(0)

    print(f"Checking {len(changed_files)} HTML file(s): {changed_files}")

    all_violations = []
    seen = set()  # (file, line, word) keys already reported

    for filepath in changed_files:
        if not os.path.exists(filepath):
            print(f"Warning: File not found: {filepath}")
            continue

        # Skip binary files
        if is_binary_file(filepath):
            print(f"Skipping binary file: {filepath}")
            continue

        try:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except OSError as e:
            print(f"Error reading {filepath}: {e}")
            continue

        # Check each extracted text segment for violations
        for segment in extract_text_with_positions(content):
            words = check_underscore_violations(segment)
            if not words:
                continue

            # The segment is whitespace-normalised parsed text, so it may
            # not appear verbatim on any single source line (e.g. when the
            # text wraps across lines).
            segment_line = find_line_number(content, segment)

            for word in words:
                line_num = segment_line or _find_word_line(content, word)
                if line_num:
                    context = get_context_line(content, line_num)
                else:
                    # Never drop a violation just because it cannot be
                    # located: report it with line 0 and show the
                    # extracted text as context instead.
                    line_num = 0
                    context = get_context_line(segment, 1)

                key = (filepath, line_num, word + '_')
                if key in seen:
                    continue
                seen.add(key)
                all_violations.append({
                    'file': filepath,
                    'line': line_num,
                    'word': word + '_',
                    'context': context,
                })

    if not all_violations:
        print("\nNo violations found")
        sys.exit(0)

    # Write violations to JSON file for later use
    with open('violations.json', 'w', encoding='utf-8') as f:
        json.dump(all_violations, f, indent=2)

    print(f"\nFound {len(all_violations)} violation(s):")
    for v in all_violations:
        print(f"  {v['file']}:{v['line']} - '{v['word']}' in context: {v['context']}")

    sys.exit(1)
|
|
|
|
|
|
# Run the check only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()
|