doctools/spelling.py

OILS / doctools / spelling.py View on Github | oilshell.org

141 lines, 85 significant

1	#!/usr/bin/env python2
2	"""spelling.py.
3
4	Filter the output of 'lynx -dump' into a list of words to spell check.
5	"""
6	from __future__ import print_function
7
8	from collections import Counter
9	import optparse
10	import re
11	import sys
12
13	from doctools.util import log
14
15
def SplitWords(contents):
    """Yield the words found in a blob of text, in order of appearance.

    URLs are removed first so that their path components are not reported as
    words.  A word is a run of ASCII letters, optionally followed by the
    contraction suffix 't (as in doesn't, can't).

    Args:
      contents: string to split, e.g. the output of 'lynx -dump'.

    Yields:
      Each word as a string.  Duplicates are not removed.
    """
    # Remove URLs so path components don't show up as words.  The alternation
    # must be a bare '|': an escaped '\|' matches a literal pipe character, so
    # no real URL would ever be stripped.
    contents = re.sub(r'(?:https?|file)://\S+', '', contents)

    # Take into account contractions with apostrophes
    #
    # - doesn't
    # - can't
    word_re = re.compile(
        r'''
        [a-zA-Z]+
        (?:\'t\b)?   # optional contraction
        ''', re.VERBOSE)

    for w in word_re.findall(contents):
        yield w
35
36
def WordList(f):
    """Yield one entry per line of f, with surrounding whitespace stripped.

    Args:
      f: iterable of strings, e.g. an open file holding one word per line.

    Yields:
      Each line after strip().  A blank line yields the empty string.
    """
    for raw_line in f:
        # Entries are passed through unchanged apart from stripping; nothing
        # here filters out special characters.
        entry = raw_line.strip()
        yield entry
41
42
def Options():
    """Build the command line flag parser for this tool.

    Returns:
      An optparse.OptionParser that accepts --known-words and
      --more-than-bash.
    """
    parser = optparse.OptionParser()

    # Path to the dictionary file that the 'check' action compares against.
    parser.add_option(
        '--known-words',
        dest='known_words',
        help='List of words like /usr/share/dict/words')

    # NOTE(review): main() in this file never reads more_than_bash; the flag
    # looks unrelated to spell checking -- confirm before removing it.
    parser.add_option(
        '--more-than-bash',
        dest='more_than_bash',
        type='int',
        default=0,
        help='Expected number of cases where OSH starts more processes than bash')

    return parser
57
58
def main(argv):
    """Dispatch on the action named by the first positional argument.

    Actions:
      word-split: read text from stdin and print one word per line.
      check FILE...: report word frequencies and potential misspellings for
        the given word files (one word per line).  Requires --known-words.

    Args:
      argv: full argument vector, including the program name at argv[0].

    Raises:
      RuntimeError: if no action is given, the action is not recognized, or
        'check' is requested without --known-words.
    """
    o = Options()
    opts, argv = o.parse_args(argv[1:])

    # Report a missing action as RuntimeError (the exception type the
    # __main__ guard handles) rather than letting argv[0] raise IndexError.
    if not argv:
        raise RuntimeError('Expected an action: word-split or check')
    action = argv[0]

    if action == 'word-split':
        contents = sys.stdin.read()
        for w in SplitWords(contents):
            print(w)

    elif action == 'check':
        # Fail before doing any work instead of reaching open(None) later.
        if not opts.known_words:
            raise RuntimeError("Action 'check' requires --known-words")
        _Check(argv[1:], opts.known_words)

    else:
        raise RuntimeError('Invalid action %r' % action)


def _Check(word_files, known_words_path):
    """Print frequency stats and potential misspellings for word_files.

    Args:
      word_files: list of paths, each holding one word per line.
      known_words_path: path to a dictionary file, one word per line.
    """
    # Frequency of every word across the whole corpus.
    d = Counter()
    for path in word_files:
        with open(path) as f:
            d.update(WordList(f))

    # Sort once; both reports below are slices of the same ranking.
    ranked = d.most_common()

    print('')
    print('Most common words')
    print('')
    for word, count in ranked[:20]:
        print('%10d %s' % (count, word))

    print('')
    print('Least common words')
    print('')
    for word, count in ranked[-20:]:
        print('%10d %s' % (count, word))

    log('%d word files', len(word_files))
    log('%d unique words', len(d))

    with open(known_words_path) as f:
        known_words = set(WordList(f))

    print('')
    print('Potential Misspellings')
    print('')

    for path in word_files:

        print()
        print('\t%s' % path)
        print()

        with open(path) as f:
            unknown = set()
            for w in WordList(f):
                # Accept an exact-case match as well as the lowercased form,
                # so capitalized dictionary entries (e.g. proper nouns) are
                # not flagged just because only w.lower() was looked up.
                if w not in known_words and w.lower() not in known_words:
                    unknown.add(w)

        if unknown:
            for u in sorted(unknown):
                # only occurs once
                if d.get(u) == 1:
                    print(u)
            log('\t%d unknown words in %s', len(unknown), path)

    # Checking algorithms:
    #
    # - Does it appear in the dictionary?  Problem: most computer terms
    # - Does it appear only once or twice in the whole corpus?
    # - Is the edit distance very close to a dictionary word?
    #   - e.g. substitutions is a typo
134
135
if __name__ == '__main__':
    # Script entry point: a RuntimeError from main() becomes a one-line
    # message on stderr and exit status 1 instead of a traceback.
    try:
        main(sys.argv)
    except RuntimeError as err:
        message = 'FATAL: %s' % err
        print(message, file=sys.stderr)
        sys.exit(1)