OILS / doctools / spelling.py
#!/usr/bin/env python2
"""
spelling.py

Filter the output of 'lynx -dump' into a list of words to spell check.
"""
from __future__ import print_function

from collections import Counter
import optparse
import re
import sys

from doctools.util import log


def SplitWords(contents):
    # Remove URLs so path components don't show up as words
    contents = re.sub(r'(http|https|file)://\S+', '', contents)

    # Take into account contractions with apostrophes:
    #
    # - doesn't
    # - can't

    WORD_RE = re.compile(r'''
    [a-zA-Z]+
    (?:\'t\b)?    # optional contraction
    ''', re.VERBOSE)

    words = WORD_RE.findall(contents)

    for w in words:
        yield w
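
# A quick sanity check of SplitWords (a hypothetical example, not part of the
# original file): the URL is removed before matching, and "'t" contractions
# come through as single tokens.
#
#   >>> list(SplitWords("See https://example.com -- it can't fail"))
#   ['See', 'it', "can't", 'fail']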

def WordList(f):
    for line in f:
        # no special characters allowed
        yield line.strip()


def Options():
    """Returns an option parser instance."""
    p = optparse.OptionParser()
    p.add_option(
        '--known-words', dest='known_words',
        help='List of words like /usr/share/dict/words')
    p.add_option(
        '--more-than-bash', dest='more_than_bash', type=int, default=0,
        help='Expected number of cases where OSH starts more processes than bash')
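    # NOTE: --more-than-bash is accepted but never used in this file; the
    # help text appears to be copied from another script.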
    return p


def main(argv):
    o = Options()
    opts, argv = o.parse_args(argv[1:])

    if not argv:
        raise RuntimeError('Action required: word-split or check')
    action = argv[0]

    if action == 'word-split':
        contents = sys.stdin.read()
        for w in SplitWords(contents):
            print(w)

    elif action == 'check':
        word_files = argv[1:]

        d = Counter()

        for path in word_files:
            with open(path) as f:
                for word in WordList(f):
                    d[word] += 1

        print('')
        print('Most common words')
        print('')
        for word, count in d.most_common()[:20]:
            print('%10d %s' % (count, word))

        print('')
        print('Least common words')
        print('')
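        # most_common() sorts by count descending, so the tail of the list
        # holds the rarest words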
        for word, count in d.most_common()[-20:]:
            print('%10d %s' % (count, word))

        log('%d word files', len(word_files))
        log('%d unique words', len(d))

        if not opts.known_words:
            raise RuntimeError('--known-words is required for the check action')

        known_words = {}
        with open(opts.known_words) as f:
            for w in WordList(f):
                known_words[w] = True

        print('')
        print('Potential Misspellings')
        print('')

        for path in word_files:

            print()
            print('\t%s' % path)
            print()

            with open(path) as f:
                unknown = {}
                for w in WordList(f):
                    #if d.get(word) == 1:
                    #    print(word)
                    if w.lower() not in known_words:
                        unknown[w] = True

            if unknown:
                for u in sorted(unknown):
                    # only report words that occur exactly once in the corpus
                    if d.get(u) == 1:
                        print(u)
                log('\t%d unknown words in %s', len(unknown), path)

        # Checking algorithms:
        #
        # - Does it appear in the dictionary?  Problem: most computer terms don't.
        # - Does it appear only once or twice in the whole corpus?
        # - Is the edit distance very close to a dictionary word?
        #   - e.g. a word that differs by one substitution is likely a typo
    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    try:
        main(sys.argv)
    except RuntimeError as e:
        print('FATAL: %s' % e, file=sys.stderr)
        sys.exit(1)