forked from go-shiori/go-readability
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser-check.go
96 lines (82 loc) · 2.3 KB
/
parser-check.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
package readability
import (
"io"
"math"
"strings"
"github.com/go-shiori/dom"
"golang.org/x/net/html"
)
// Check reports whether the content read from input is readable without
// parsing the whole thing. The input is turned into a document with
// dom.Parse and then handed to CheckDocument; if parsing fails, Check
// returns false.
func (ps *Parser) Check(input io.Reader) bool {
	if doc, err := dom.Parse(input); err == nil {
		return ps.CheckDocument(doc)
	}

	// The input could not be parsed, so it is treated as not readable.
	return false
}
// CheckDocument reports whether the document is readable without parsing
// the whole thing.
//
// It walks the tree rooted at doc, collects candidate nodes, and scores
// them one by one. The result is true as soon as the accumulated score
// exceeds 20, and false if the candidates run out before that happens.
func (ps *Parser) CheckDocument(doc *html.Node) bool {
	const (
		minTextLength = 140 // candidates with fewer characters are ignored
		minScore      = 20  // accumulated score that must be exceeded
	)

	// Candidates are every <p> and <pre> element, plus every <div> that is
	// the direct parent of a <br>. A div such as
	//
	//	<div>
	//	  Sentences<br>
	//	  <br>
	//	  Sentences<br>
	//	</div>
	//
	// holds several <br> children, so `seen` is used as a set to make sure
	// each node ends up in `candidates` only once.
	candidates := []*html.Node{}
	seen := make(map[*html.Node]struct{})

	// remember records n as a candidate unless it was recorded before.
	remember := func(n *html.Node) {
		if _, ok := seen[n]; ok {
			return
		}
		seen[n] = struct{}{}
		candidates = append(candidates, n)
	}

	// walk visits n and all of its descendants depth-first, in document order.
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch dom.TagName(n) {
			case "p", "pre":
				remember(n)
			case "br":
				if parent := n.Parent; parent != nil && dom.TagName(parent) == "div" {
					remember(parent)
				}
			}
		}

		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	// The callback below is stateful on purpose: `total` accumulates across
	// calls, and the callback's return value tells someNode whether the
	// threshold has been crossed yet.
	var total float64
	return ps.someNode(candidates, func(n *html.Node) bool {
		// Skip nodes that are not considered visible.
		if !ps.isProbablyVisible(n) {
			return false
		}

		// Skip nodes whose class/id match the "unlikely candidate" pattern,
		// unless they also match the "maybe a candidate" pattern.
		classAndID := dom.ClassName(n) + " " + dom.ID(n)
		if rxUnlikelyCandidates.MatchString(classAndID) &&
			!rxOkMaybeItsACandidate.MatchString(classAndID) {
			return false
		}

		// Skip paragraphs that have an <li> ancestor.
		if dom.TagName(n) == "p" && ps.hasAncestorTag(n, "li", -1, nil) {
			return false
		}

		// Skip nodes whose trimmed text is too short to count.
		length := charCount(strings.TrimSpace(dom.TextContent(n)))
		if length < minTextLength {
			return false
		}

		// Only the characters beyond the minimum length add to the score.
		total += math.Sqrt(float64(length - minTextLength))
		return total > minScore
	})
}