doctools/spelling.py

OILS / doctools / spelling.py View on Github | oilshell.org

138 lines, 81 significant

1	#!/usr/bin/env python2
2	"""
3	spelling.py
4
5	Filter the output of 'lynx -dump' into a list of words to spell check.
6	"""
7	from __future__ import print_function
8
9	from collections import Counter
10	import optparse
11	import re
12	import sys
13
14	from doctools.util import log
15
16
def SplitWords(contents):
  """Yield the words found in a text dump, in order of appearance.

  URLs (http://, https://, file://) are deleted first so that their path
  components are not reported as words.  A word is a run of ASCII letters,
  optionally followed by a 't contraction suffix:

  - doesn't
  - can't

  Args:
    contents: the whole document as a single string

  Yields:
    Each word as a string; duplicates are included.
  """
  # The alternation must use a bare '|'.  An escaped '\|' matches a literal
  # pipe character, so the pattern would never match a real URL.
  contents = re.sub(r'(?:http|https|file)://\S+', '', contents)

  word_re = re.compile(r'''
  [a-zA-Z]+
  (?:\'t\b)?   # optional contraction
  ''', re.VERBOSE)

  for match in word_re.finditer(contents):
    yield match.group(0)
35
36
def WordList(f):
  """Yield every line of f with leading and trailing whitespace removed.

  Args:
    f: an iterable of lines, e.g. an open file with one word per line

  Yields:
    The stripped text of each line.  Nothing is filtered out, so a blank
    line produces an empty string.
  """
  for raw_line in f:
    word = raw_line.strip()
    yield word
41
42
def Options():
  """Build and return the optparse parser for this tool's flags."""
  parser = optparse.OptionParser()

  # (flag, keyword arguments) pairs, registered in this order
  flag_specs = [
      ('--known-words', dict(
          dest='known_words',
          help='List of words like /usr/share/dict/words')),
      ('--more-than-bash', dict(
          dest='more_than_bash', type=int, default=0,
          help='Expected number of cases where OSH starts more processes '
               'than bash')),
  ]
  for flag, kwargs in flag_specs:
    parser.add_option(flag, **kwargs)

  return parser
53
54
def main(argv):
  """Run the spelling action named on the command line.

  Actions:
    word-split     Read text from stdin and print one word per line.
    check FILE...  Each FILE holds one word per line.  Print the most and
                   least common words across all files, then for each file
                   print the words that are missing from --known-words and
                   occur exactly once in the whole corpus.

  Args:
    argv: full argument vector, with the program name at index 0

  Raises:
    RuntimeError: no action was given, the action is unknown, or 'check' was
      requested without --known-words.
  """
  o = Options()
  opts, argv = o.parse_args(argv[1:])

  # Report a missing action as a usage error instead of an IndexError, so it
  # is raised as the same exception type as the other usage errors here.
  if not argv:
    raise RuntimeError('Action required: word-split or check')
  action = argv[0]

  if action == 'word-split':
    contents = sys.stdin.read()
    for w in SplitWords(contents):
      print(w)

  elif action == 'check':
    # Fail before doing any work; otherwise open(None) raises a TypeError
    # only after the word statistics have been printed.
    if opts.known_words is None:
      raise RuntimeError('--known-words is required for the check action')

    word_files = argv[1:]

    # Occurrences of each word across all files
    d = Counter()
    for path in word_files:
      with open(path) as f:
        for word in WordList(f):
          d[word] += 1

    # Sort once and slice both ends
    ranked = d.most_common()

    print('')
    print('Most common words')
    print('')
    for word, count in ranked[:20]:
      print('%10d %s' % (count, word))

    print('')
    print('Least common words')
    print('')
    for word, count in ranked[-20:]:
      print('%10d %s' % (count, word))

    log('%d word files', len(word_files))
    log('%d unique words', len(d))

    with open(opts.known_words) as f:
      known_words = set(WordList(f))

    print('')
    print('Potential Misspellings')
    print('')

    for path in word_files:
      print('')
      print('\t%s' % path)
      print('')

      # The lookup is on the lowercased word; the original spelling is what
      # gets stored and reported.
      with open(path) as f:
        unknown = set(w for w in WordList(f) if w.lower() not in known_words)

      if unknown:
        for u in sorted(unknown):
          # Only report words that occur once in the whole corpus
          if d.get(u) == 1:
            print(u)
        log('\t%d unknown words in %s', len(unknown), path)

    # Checking algorithms:
    #
    # - Does it appear in the dictionary?  Problem: most computer terms
    # - Does it appear only once or twice in the whole corpus?
    # - Is the edit distance very close to a dictionary word?
    #   - e.g. substitutions is a typo

  else:
    raise RuntimeError('Invalid action %r' % action)
131
132
if __name__ == '__main__':
  # A RuntimeError raised by main() is reported on stderr and turned into
  # exit status 1.
  try:
    main(sys.argv)
  except RuntimeError as err:
    message = 'FATAL: %s' % err
    print(message, file=sys.stderr)
    sys.exit(1)