forked from zulip/zulip
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtml_grep.py
63 lines (51 loc) · 1.79 KB
/
html_grep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from collections import defaultdict
from typing import Dict, List, Set
from .html_branches import html_branches, HtmlTreeBranch
def show_all_branches(fns):
# type: (List[str]) -> None
for fn in fns:
print(fn)
with open(fn, 'r') as f:
text = f.read()
branches = html_branches(text, fn=fn)
for branch in branches:
print(branch.text())
print('---')
class Grepper:
'''
A Grepper object is optimized to do repeated
searches of words that can be found in our
HtmlTreeBranch objects.
'''
def __init__(self, fns):
# type: (List[str]) -> None
all_branches = [] # type: List[HtmlTreeBranch]
for fn in fns:
with open(fn, 'r') as f:
text = f.read()
branches = html_branches(text, fn=fn)
all_branches += branches
self.word_dict = defaultdict(set) # type: Dict[str, Set[HtmlTreeBranch]]
for b in all_branches:
for word in b.words:
self.word_dict[word].add(b)
self.all_branches = set(all_branches)
def grep(self, word_set):
# type: (Set[str]) -> None
words = list(word_set) # type: List[str]
if len(words) == 0:
matches = self.all_branches
else:
matches = self.word_dict[words[0]]
for i in range(1, len(words)):
matches = matches & self.word_dict[words[i]]
branches = list(matches)
branches.sort(key=lambda branch: (branch.fn, branch.line))
for branch in branches:
print('%s %d' % (branch.fn, branch.line))
print(branch.staircase_text())
print('')
def grep(fns, words):
# type: (List[str], Set[str]) -> None
grepper = Grepper(fns)
grepper.grep(words)