-
Notifications
You must be signed in to change notification settings - Fork 39
/
Copy pathhtml_extractor.rb
102 lines (89 loc) · 2.19 KB
/
html_extractor.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Copyright (C) 2009 Pascal Rettig.
# Mixin that pulls the human-readable text fragments out of an HTML string
# using a small hand-rolled scanner (no HTML parser dependency).
module HtmlExtractor
  # Fragments that are never reported. The regex and length checks in
  # html_extract_keep? already reject most of these; the list is kept so the
  # historical filter stays in force on every Ruby version.
  HTML_EXTRACT_IGNORED = ['>', '>>', '<<', ' ', ' ', ''].freeze

  # Elements whose content is code, not text, mapped to the pattern that
  # terminates them. Matching is case-insensitive because HTML tag names are.
  HTML_EXTRACT_RAW_TEXT_CLOSERS = {
    'script' => %r{</script\s*>}i,
    'style' => %r{</style\s*>}i
  }.freeze

  # Matches a fragment made up solely of punctuation and whitespace.
  # Anchored with \A/\z (whole string); ^/$ would match any single empty
  # line inside an otherwise meaningful multi-line fragment.
  HTML_EXTRACT_NOISE = /\A[[:punct:][:space:]]*\z/

  # Collects the text found between tags of an HTML document.
  #
  # Each run of characters between markup is stripped and kept when it is
  # longer than one character, is not in HTML_EXTRACT_IGNORED and is not pure
  # punctuation/whitespace. The contents of <script> and <style> elements are
  # skipped. Markup that is never closed ends the scan: whatever follows it is
  # treated as part of the unfinished tag.
  #
  # @param source [String, nil] HTML to scan; nil is treated as empty
  # @return [Array<String>] the kept fragments in document order
  def html_extract_text(source)
    source = source.to_s
    lines = []
    pos = 0
    src_len = source.length

    while pos < src_len
      lt = source.index('<', pos)
      text = source[pos...(lt || src_len)].strip
      lines << text if html_extract_keep?(text)
      break unless lt

      pos = html_extract_skip_markup(source, lt) || src_len
    end

    lines
  end

  private

  # Decides whether a stripped text fragment is worth reporting.
  #
  # @param text [String] stripped fragment
  # @return [Boolean]
  def html_extract_keep?(text)
    text.length > 1 &&
      !HTML_EXTRACT_IGNORED.include?(text) &&
      text !~ HTML_EXTRACT_NOISE
  end

  # Finds where scanning should resume after the markup starting at +lt+.
  #
  # For <script>/<style> that is just past the matching closing tag, so the
  # embedded code is never reported as text; for any other tag it is just past
  # the tag's own closing ">".
  #
  # @param source [String] document being scanned
  # @param lt [Integer] index of the "<" that opens the markup
  # @return [Integer, nil] index to resume at, or nil when the markup is
  #   never closed
  def html_extract_skip_markup(source, lt)
    after_lt = lt + 1
    _name, closer = HTML_EXTRACT_RAW_TEXT_CLOSERS.find do |name, _pattern|
      source[after_lt, name.length].to_s.downcase == name
    end
    return html_extract_close_token(source, '>', after_lt) unless closer

    found = closer.match(source, after_lt)
    found && found.end(0)
  end

  # Locates the first occurrence of +chr+ at or after +pos+ that is not inside
  # a quoted attribute value.
  #
  # Quote state is tracked as "which quote character is currently open", so an
  # apostrophe inside a double-quoted value (or vice versa) does not unbalance
  # the tag. A backslash hides the character that follows it from the quote
  # tracking.
  #
  # @param source [String] document being scanned
  # @param chr [String] closing token to look for (">" for tags)
  # @param pos [Integer] index to start searching from
  # @return [Integer, nil] index just past the closing token, or nil when no
  #   unquoted occurrence exists
  def html_extract_close_token(source, chr, pos)
    open_quote = nil

    loop do
      close_pos = source.index(chr, pos)
      return nil unless close_pos

      # Escapes do not carry across a candidate token, so reset per segment.
      escaped = false
      source[pos...close_pos].each_char do |c|
        if escaped
          escaped = false
        elsif c == '\\'
          escaped = true
        elsif open_quote
          open_quote = nil if c == open_quote
        elsif c == "'" || c == '"'
          open_quote = c
        end
      end

      return close_pos + 1 unless open_quote

      # The candidate was inside a quoted value; keep looking after it.
      pos = close_pos + 1
    end
  end
end