forked from hjdhnx/dr_py
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutil.ym.js
158 lines (150 loc) · 4.63 KB
/
util.ym.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import 'assets://js/lib/uri.min.js'
import cheerio from 'assets://js/lib/cheerio.min.js';
import 'assets://js/lib/crypto-js.js'
// Alphabet that random strings are drawn from. The last 10 characters are the
// digits; randIndex depends on that layout to keep a digit out of position 0.
// NOTE(review): the lowercase run contains 'a' twice and no 'i' (and the
// uppercase run has no 'I'); left byte-for-byte as found.
const charStr = 'abacdefghjklmnopqrstuvwxyzABCDEFGHJKLMNOPQRSTUVWXYZ0123456789';

/**
 * Draw a random integer index in the inclusive range [min, max].
 *
 * When `i` is 0 (the first character of a string being generated) a draw that
 * lands in the trailing digit section of `charStr` is discarded and redrawn.
 *
 * @param {number} min - Lowest index that may be returned.
 * @param {number} max - Highest index that may be returned.
 * @param {number} i - Position of the character being generated.
 * @returns {number} The chosen index.
 */
export function randIndex(min, max, i) {
    const drawn = Math.floor(Math.random() * (max - min + 1) + min);
    const firstDigitAt = charStr.length - 10;
    // Loose equality is kept on purpose so that values coercing to 0 still
    // count as "first position".
    const mustRedraw = i == 0 && drawn >= firstDigitAt;
    return mustRedraw ? randIndex(min, max, i) : drawn;
}
/**
 * Build a random string out of the characters in `charStr`.
 *
 * Each position is chosen through `randIndex`, which receives the position so
 * it can apply its first-character rule.
 *
 * @param {number} [len] - Desired length; any falsy value (including 0) means 15.
 * @returns {string} The generated string.
 */
export function randomStr(len) {
    const wanted = len || 15;
    const lastIndex = charStr.length - 1;
    let out = '';
    for (let pos = 0; pos < wanted; pos += 1) {
        out += charStr[randIndex(0, lastIndex, pos)];
    }
    return out;
}
/**
 * Combine a base URL with a second URL and return the result as a string.
 *
 * Both inputs are trimmed and have trailing '/' removed first. How `url` is
 * treated depends on its prefix:
 *   - 'http://' or 'https://' : used as is
 *   - '://'                   : prefixed with the base protocol
 *   - '//'                    : prefixed with the base protocol and ':'
 *   - anything else           : appended after protocol://host[:port]/ of the base
 * Afterwards, an empty path or query on the result is filled in from the base.
 *
 * Relies on the global `Uri` class and on `String.prototype.rstrip`; neither is
 * defined in this file (presumably supplied by uri.min.js / the host runtime —
 * confirm).
 *
 * @param {string} base - Base URL; a falsy value is treated as ''.
 * @param {string} url - URL to resolve; a falsy value is treated as ''.
 * @returns {string} The joined URL.
 */
export function urljoin(base, url) {
    const baseUri = new Uri((base || '').trim().rstrip('/'));
    const target = (url || '').trim().rstrip('/');

    let absolute;
    if (target.startsWith('http://') || target.startsWith('https://')) {
        absolute = target;
    } else if (target.startsWith('://')) {
        absolute = baseUri.protocol() + target;
    } else if (target.startsWith('//')) {
        absolute = baseUri.protocol() + ':' + target;
    } else {
        const origin = baseUri.protocol() + '://' + baseUri.host();
        const portSuffix = baseUri.port() ? ':' + baseUri.port() : '';
        absolute = origin + portSuffix + '/' + target;
    }
    const joined = new Uri(absolute);

    // Inherit the base path when the joined URL has none (or only whitespace).
    const joinedPath = joined.path();
    const pathIsBlank = !joinedPath || joinedPath.trim().length === 0;
    if (pathIsBlank && baseUri.path()) {
        joined.path(baseUri.path());
    }
    // Inherit the base query string when the joined URL has none.
    if (!joined.query() && baseUri.query()) {
        joined.query(baseUri.query());
    }
    return joined.toString();
}
// Matches a rule/option that ends in an attribute name expected to hold a URL.
const DOM_CHECK_ATTR = /(url|src|href|data-original|data-src)$/;
// Matches a selector segment that already carries :eq/:lt/:gt or an id ('#');
// used by pdfh to decide whether ':eq(0)' must be appended.
// These patterns are only ever used as yes/no checks, so they must NOT have
// the `g` flag: with `g`, RegExp.prototype.test keeps `lastIndex` between
// calls and the answer would depend on whatever string was tested before.
const SELECT_REGEX = /:eq|:lt|:gt|#/;
// Same as SELECT_REGEX but without the '#' alternative; used by pdfa.
const SELECT_REGEX_A = /:eq|:lt|:gt/;
/**
 * Evaluate a '&&'-separated parse rule against HTML and return one value.
 *
 * The rule has the form `sel1&&sel2&&...&&option`. Every selector segment that
 * does not already contain :eq/:lt/:gt or '#' gets ':eq(0)' appended, and the
 * segments are joined with spaces into one selector. The trailing `option`
 * chooses what is returned: 'Text' -> .text(), 'Html' -> .html(), anything else
 * -> .attr(option). A rule without '&&' has no option and returns the matched
 * selection's toString().
 *
 * @param {string|{rr: Function, ele: *}} html - Either markup to load with
 *   cheerio, or an item shaped like those produced by pdfa: `rr` is the loaded
 *   cheerio function and `ele` the element to search in. Any value whose typeof
 *   is 'object' is treated as the latter.
 * @param {string} parse - The parse rule. For the object form, a leading
 *   'body&&' is stripped; if nothing but an option remains, the element itself
 *   is used.
 * @param {string} [base_url] - When given and `option` ends in a URL-like
 *   attribute name, the result is made absolute: text before the first 'http'
 *   is cut off, otherwise the value is joined onto base_url with urljoin.
 * @returns {*} '' for an empty rule, otherwise whatever the cheerio accessor
 *   returned (possibly post-processed as described above).
 */
export function pdfh(html, parse, base_url) {
    if (!parse || !parse.trim()) {
        return '';
    }
    const eleFind = typeof html === 'object';
    let selector = parse;
    let option;

    if (eleFind && selector.startsWith('body&&')) {
        selector = selector.slice(6);
        if (!selector.includes('&&')) {
            // Only an option was left: operate on the element itself.
            option = selector.trim();
            selector = '*=*';
        }
    }

    if (selector.includes('&&')) {
        const segments = selector.split('&&');
        option = segments.pop();
        // String.prototype.search always scans from index 0 and ignores the
        // regex's `g` flag / lastIndex, so the check cannot be influenced by
        // earlier matches (RegExp.prototype.test on a /g regex can).
        selector = segments
            .map((segment) => (segment.search(SELECT_REGEX) === -1 ? `${segment}:eq(0)` : segment))
            .join(' ');
    }

    const $ = eleFind ? html.rr : cheerio.load(html);
    let ret;
    if (!eleFind) {
        ret = $(selector);
    } else if (selector === '*=*' || $(html.ele).is(selector)) {
        ret = html.ele;
    } else {
        ret = $(html.ele).find(selector);
    }

    if (!option) {
        return $(ret).toString();
    }

    let result;
    if (option === 'Text') {
        result = $(ret).text();
    } else if (option === 'Html') {
        result = $(ret).html();
    } else {
        result = $(ret).attr(option);
    }

    if (result && base_url && DOM_CHECK_ATTR.test(option)) {
        result = /http/.test(result)
            ? result.slice(result.indexOf('http'))
            : urljoin(base_url, result);
    }
    return result;
}
/**
 * Evaluate a '&&'-separated selector rule and return every matching element.
 *
 * Each segment except the last gets ':eq(0)' appended unless it already
 * contains :eq/:lt/:gt; the segments are then joined with spaces into a single
 * selector.
 *
 * @param {string|{rr: Function, ele: *}} html - Either markup to load with
 *   cheerio, or a previously returned item (`rr` is the loaded cheerio
 *   function, `ele` the element to search in). Any value whose typeof is
 *   'object' is treated as the latter.
 * @param {string} parse - The selector rule.
 * @returns {Array<{rr: Function, ele: *}>} One entry per matched element, each
 *   carrying the cheerio function it belongs to; [] for an empty rule.
 */
export function pdfa(html, parse) {
    if (!parse || !parse.trim()) {
        return [];
    }
    const eleFind = typeof html === 'object';

    let selector = parse;
    if (selector.includes('&&')) {
        const segments = selector.split('&&');
        const lastIdx = segments.length - 1;
        // String.prototype.search ignores the regex's `g` flag / lastIndex, so
        // the outcome cannot depend on previously tested strings.
        selector = segments
            .map((segment, idx) => (
                idx < lastIdx && segment.search(SELECT_REGEX_A) === -1
                    ? `${segment}:eq(0)`
                    : segment
            ))
            .join(' ');
    }

    const $ = eleFind ? html.rr : cheerio.load(html);
    let ret;
    if (!eleFind) {
        ret = $(selector);
    } else if ($(html.ele).is(selector)) {
        ret = html.ele;
    } else {
        ret = $(html.ele).find(selector);
    }

    const result = [];
    if (ret) {
        // `ret` may be the bare element taken from `html.ele`, which has no
        // .each(); wrapping with $() gives a uniform collection either way.
        $(ret).each((idx, ele) => {
            result.push({ rr: $, ele });
        });
    }
    return result;
}
/**
 * Bundle of the parsing helpers defined in this module (the object and its
 * members are also published on globalThis at the end of the file).
 */
const defaultParser = {
    pdfh: pdfh,
    pdfa: pdfa,
    /**
     * Like pdfh, but when the rule ends in a URL-like attribute name the value
     * is turned into an absolute URL.
     *
     * @param {string|object} html - Passed through to pdfh.
     * @param {string} parse - Parse rule, passed through to pdfh.
     * @param {string} [uri] - Base URL for relative results. When omitted or
     *   falsy the global `MY_URL` is used (not defined in this file —
     *   presumably set by the host runtime).
     * @returns {*} The pdfh result, made absolute where applicable.
     */
    pd(html, parse, uri) {
        // `pd` is also handed out detached from this object; in module (strict)
        // code a bare call then has `this === undefined`, so fall back to
        // defaultParser instead of throwing on `this.pdfh`.
        const parser = this && typeof this.pdfh === 'function' ? this : defaultParser;
        let ret = parser.pdfh(html, parse);
        if (DOM_CHECK_ATTR.test(parse)) {
            if (/http/.test(ret)) {
                // Drop anything that precedes the first 'http'.
                ret = ret.substr(ret.indexOf('http'));
            } else {
                // Honour an explicit base; MY_URL is only read when none is given.
                ret = urljoin(uri || MY_URL, ret);
            }
        }
        return ret;
    },
};
// Publish the helpers on globalThis so they can be called without importing
// this module (presumably by rule scripts run in the same runtime — confirm).
globalThis.randIndex = randIndex;
globalThis.randomStr = randomStr;
globalThis.urljoin = urljoin;
globalThis.joinUrl = urljoin; // alias of urljoin
globalThis.defaultParser = defaultParser;
globalThis.pdfa = defaultParser.pdfa;
globalThis.pdfh = defaultParser.pdfh;
// pd reads `this.pdfh`; bind it so a bare `pd(...)` call (where `this` would be
// undefined in strict/module code) still has the parser as its receiver.
globalThis.pd = defaultParser.pd.bind(defaultParser);