diff --git a/README.md b/README.md index 427ff22..214b2ed 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,17 @@ In this example the value for first_value will remain 0 even though meta_pre2 al We love contributions! Open an issue, or fork/create a pull request. +## Develop Locally + +Since extractnet relies on several C++ modules, you need to compile them before running it locally. + +Usually, this command is all you need: +``` +make +``` + +However, you can try to build it + # More details about the code structure Coming soon diff --git a/extractnet/nn_models.py b/extractnet/nn_models.py index 6bbf767..67fbd83 100644 --- a/extractnet/nn_models.py +++ b/extractnet/nn_models.py @@ -35,7 +35,11 @@ def preprocess(self, html): return feat, blocks - def predict(self, html): + def predict(self, html, top_rank=10): + ''' + html: HTML string or list of HTML strings + top_rank: number of highest-scoring blocks (top K) used to predict author, breadcrumbs (keywords) and date + ''' single = False if isinstance(html, list): x, css, blocks= [], [], [] @@ -56,17 +60,17 @@ def predict(self, html): inputs_onnx = { 'input': x, 'css': css } logits = self.ort_session.run(None, inputs_onnx)[0] - decoded = self.decode_output(logits, blocks) + decoded = self.decode_output(logits, blocks, top_rank=top_rank) return decoded[0] if single else decoded - def decode_output(self, logits, doc_blocks): + def decode_output(self, logits, doc_blocks, top_rank=10): outputs = [] for jdx, preds in enumerate(logits): output = {} blocks = doc_blocks[jdx] for idx, label in enumerate(self.label_order): if label in ['author', 'date', 'breadcrumbs']: - top_k = 10 + top_k = min(top_rank, len(preds[:, idx])) scores = softmax([preds[:, idx]])[0] ind = np.argpartition(preds[:, idx], -top_k)[-top_k:] result = [ (fix_encoding(str_cast(blocks[idx].text)), scores[idx]) for idx in ind if scores[idx] > self.cls_threshold] diff --git a/provision.sh b/provision.sh index 0ce7fc5..232cd92 100755 --- a/provision.sh +++ 
b/provision.sh @@ -1,4 +1,4 @@ #!/bin/bash cython --cplus extractnet/*.pyx -cython --cplus extractnet/features//*.pyx \ No newline at end of file +cython --cplus extractnet/features/*.pyx \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt index 47a501d..5147815 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,6 @@ Cython>=0.21.1 +beautifulsoup4==4.9.3 +htmldate==1.2.3 ftfy>=4.1.0,<5.0.0 lxml numpy>=1.19.0 @@ -7,6 +9,7 @@ pytest-cov>=2.6.0 scikit-learn>=0.22.0 scipy>=0.17.0 sklearn-crfsuite==0.3.6 +tld==0.12.6 dateparser==1.1.0 joblib==0.17.0 onnxruntime==1.9.0 \ No newline at end of file