-
Notifications
You must be signed in to change notification settings - Fork 4
/
check_spelling.py
153 lines (124 loc) · 5.6 KB
/
check_spelling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import os
import re
import spacy
from markdown_it import MarkdownIt
from spellchecker import SpellChecker
import sys
# Load the spaCy model and spell checker
# NOTE(review): `nlp` is not referenced anywhere else in the visible part of
# this file; it is kept because loading it is an existing module-level effect.
nlp = spacy.load("en_core_web_sm")
spell = SpellChecker()
# Load custom dictionary: one allowed word per line, compared case-insensitively
custom_dict_path = 'dictionaries/custom_dict.txt'
# Read as explicit UTF-8 so the result does not depend on the platform's
# default locale encoding, and skip blank lines so the set never holds ''.
with open(custom_dict_path, 'r', encoding='utf-8') as f:
    custom_words = {line.strip().lower() for line in f if line.strip()}
# ---------------------------------------------------------------------------
# Regex patterns used to strip non-prose MDX constructs and to tokenize text
# ---------------------------------------------------------------------------

# ES-module style `import { ... } from "...";` statements (may span lines)
import_pattern = re.compile(
    r'import\s*{\s*([\s\S]*?)\s*}\s*from\s*["\']([^"\']+)["\'];',
    re.MULTILINE,
)

# Paired tags with their contents, or self-closing tags
jsx_like_tags_pattern = re.compile(
    r'<[^>]*>[\s\S]*?<\/[^>]*>|<[^>]*?/>',
    re.DOTALL,
)

# `path: "/..."` entries
path_pattern = re.compile(r'path:\s*"/[^"]*"')

# Self-closing <GuideBox ... /> components, matched case-insensitively
guidebox_pattern = re.compile(r'<GuideBox[\s\S]*?/>', re.IGNORECASE)

# Runs of six or three hexadecimal digits
hex_colours = re.compile(r'([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})')

# A word, optionally followed by an apostrophe and more word characters
# (so contractions such as "don't" stay a single token)
word_pattern = re.compile(r"\b\w+(?:'\w+)?\b")

# Escape-sequence residue: a backslash + n/u + hex digits, or `u` + 4 hex digits
escape_sequence_pattern = re.compile(r'\\[nu][0-9a-fA-F]+|u[0-9a-fA-F]{4}')
# Function to extract text while ignoring specified components and handling code blocks differently
def extract_text_from_mdx(file_path):
    """Split an MDX file into spell-checkable prose and fenced code blocks.

    The file is read as UTF-8, then import statements, `path: "/..."`
    entries, <GuideBox /> components and JSX-like tags are removed (in that
    order) before the remainder is parsed as Markdown.

    Args:
        file_path: Path of the .mdx file to read.

    Returns:
        A ``(text, code_blocks)`` tuple: ``text`` is the content of every
        ``text`` token joined with newlines, and ``code_blocks`` is a list
        holding the content of each fenced code block.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Strip non-prose constructs; the order matches the original pipeline
    # (imports, paths, GuideBox components, then generic JSX-like tags).
    for pattern in (import_pattern, path_pattern, guidebox_pattern, jsx_like_tags_pattern):
        content = pattern.sub('', content)

    tokens = MarkdownIt().parse(content)

    text = []
    code_blocks = []

    def traverse(node):
        # markdown-it emits a fenced block as ONE self-contained 'fence'
        # token (there is no open/close pair), so each fence is captured
        # directly. Toggling an "inside code block" flag per fence would
        # skip all prose following an odd-numbered fence and drop every
        # second fence's content.
        if node.type == 'fence':
            code_blocks.append(node.content)
            return
        # Inline code spans are deliberately not spell-checked
        if node.type == 'code_inline':
            return
        if node.type == 'text':
            text.append(node.content)
        # `children` may be None, so fall back to an empty list
        for child in node.children or []:
            traverse(child)

    for node in tokens:
        traverse(node)

    return '\n'.join(text), code_blocks
# Function to check for spelling errors
def check_spelling(text, is_code_block=False):
    """Return the words in *text* that the spell checker does not recognise.

    Args:
        text: The string to tokenize and check.
        is_code_block: When true, the result is wrapped as
            ``{'warnings': <unknown words>}`` instead of being returned bare.

    Returns:
        Whatever ``spell.unknown`` returns for the lower-cased candidate
        words, either directly or under the ``'warnings'`` key.
    """
    # Tokens matching any of these (anchored at the token start) are skipped
    skip_patterns = (
        re.compile(r'\bn\w+'),  # "n-prefixed" words
        re.compile(r'^\d+(px|%|em|rem|vh|vw|pt|cm|mm|in|s|ms|deg)?$'),  # CSS values
        re.compile(r'^(#?[A-Fa-f0-9]{3}|#?[A-Fa-f0-9]{6})$'),  # Hex colors
        re.compile(r'^0x[a-fA-F0-9]{40}$'),  # Ethereum addresses
        re.compile(r'^[a-f0-9]{40}$'),  # Hash-like strings (40 hex characters)
    )

    def is_candidate(token):
        # Empty fragments can appear when splitting on underscores
        if not token.strip():
            return False
        # Words containing an apostrophe are never checked
        if "'" in token:
            return False
        if token.lower() in custom_words:
            return False
        if escape_sequence_pattern.search(token):
            return False
        return not any(pattern.match(token) for pattern in skip_patterns)

    candidates = []
    for found in word_pattern.findall(text):
        # snake_case identifiers are checked piece by piece
        pieces = re.split(r'[_\s]+', found) if '_' in found else [found]
        candidates.extend(piece.lower() for piece in pieces if is_candidate(piece))

    unknown = spell.unknown(candidates)
    if is_code_block:
        return {'warnings': unknown}
    return unknown
# Function to check all .mdx files in a directory
def check_directory(directory):
    """Spell-check every .mdx file under *directory* and print a report.

    Misspellings found in prose are reported as errors. Misspellings found
    in fenced code blocks are reported as warnings and do not affect the
    return value.

    Args:
        directory: Root directory to walk recursively.

    Returns:
        True if at least one file had prose spelling errors, else False.
    """
    has_errors = False
    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith('.mdx'):
                continue
            file_path = os.path.join(root, file)
            print(f'========== Checking file: {file_path} ==========')
            # Extract text and code blocks from the MDX file
            text, code_blocks = extract_text_from_mdx(file_path)
            # Check for spelling errors in text
            errors = check_spelling(text)
            if errors:
                print(f'Spelling errors in {file_path}:')
                # Sorted so the report is stable from run to run
                for error in sorted(errors):
                    print(f' - {error}')
                has_errors = True
            # Accumulate warnings across ALL code blocks so that a later
            # block's result cannot replace an earlier block's findings.
            warnings = set()
            for code_block in code_blocks:
                warnings.update(
                    check_spelling(code_block, is_code_block=True).get('warnings', [])
                )
            if warnings:
                print(f'Warnings (spelling errors in code block) in {file_path}:')
                for warning in sorted(warnings):
                    print(f' - {warning}')
    return has_errors
# Root directory whose .mdx files are spell-checked
directory_path = 'pages'
# Run the check over the whole tree
has_errors = check_directory(directory_path)
# Exit status 1 when spelling errors were found, 0 otherwise
sys.exit(int(bool(has_errors)))