From 17775d3ee51179581ea72232aa0af34f1f416a0e Mon Sep 17 00:00:00 2001
From: theblackcat102 <theblackcat102@github.com>
Date: Mon, 11 Jul 2022 10:04:53 +0800
Subject: [PATCH] [feature] Fix issue #8

---
 README.md               | 11 +++++++++++
 extractnet/nn_models.py | 12 ++++++++----
 provision.sh            |  2 +-
 requirements-dev.txt    |  3 +++
 4 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 427ff22..214b2ed 100644
--- a/README.md
+++ b/README.md
@@ -153,6 +153,17 @@ In this example the value for first_value will remain 0 even though meta_pre2 al
 We love contributions! Open an issue, or fork/create a pull
 request.
 
+## Develop Locally
+
+Since extractnet relies on several C++ modules, you need to compile them before running it locally.
+
+Usually, the following command is all you need:
+```
+make
+```
+
+However, you can also try to build it manually; `provision.sh` regenerates the C++ sources from the `.pyx` files with Cython.
+
 # More details about the code structure
 
 Coming soon
diff --git a/extractnet/nn_models.py b/extractnet/nn_models.py
index 6bbf767..67fbd83 100644
--- a/extractnet/nn_models.py
+++ b/extractnet/nn_models.py
@@ -35,7 +35,11 @@ def preprocess(self, html):
         return feat, blocks
 
 
-    def predict(self, html):
+    def predict(self, html, top_rank=10):
+        '''
+            html: HTML string or list of HTML strings
+            top_rank: number of top-scoring blocks (K) used to predict author, breadcrumbs (keywords), and date
+        '''
         single = False
         if isinstance(html, list):
             x, css, blocks= [], [], []
@@ -56,17 +60,17 @@ def predict(self, html):
         inputs_onnx = { 'input': x, 'css': css }
 
         logits = self.ort_session.run(None, inputs_onnx)[0]
-        decoded = self.decode_output(logits, blocks)
+        decoded = self.decode_output(logits, blocks, top_rank=top_rank)
         return decoded[0] if single else decoded
 
-    def decode_output(self, logits, doc_blocks):
+    def decode_output(self, logits, doc_blocks, top_rank=10):
         outputs = []
         for jdx, preds in enumerate(logits):
             output = {}
             blocks = doc_blocks[jdx]
             for idx, label in enumerate(self.label_order):
                 if label in ['author', 'date', 'breadcrumbs']:
-                    top_k = 10
+                    top_k = min(top_rank, len(preds[:, idx]))
                     scores = softmax([preds[:, idx]])[0]
                     ind = np.argpartition(preds[:, idx], -top_k)[-top_k:]
                     result = [ (fix_encoding(str_cast(blocks[idx].text)), scores[idx]) for idx in ind if scores[idx] > self.cls_threshold]
diff --git a/provision.sh b/provision.sh
index 0ce7fc5..232cd92 100755
--- a/provision.sh
+++ b/provision.sh
@@ -1,4 +1,4 @@
 #!/bin/bash
 
 cython --cplus extractnet/*.pyx
-cython --cplus extractnet/features//*.pyx
\ No newline at end of file
+cython --cplus extractnet/features/*.pyx
\ No newline at end of file
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 47a501d..5147815 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,4 +1,6 @@
 Cython>=0.21.1
+beautifulsoup4==4.9.3
+htmldate==1.2.3
 ftfy>=4.1.0,<5.0.0
 lxml
 numpy>=1.19.0
@@ -7,6 +9,7 @@ pytest-cov>=2.6.0
 scikit-learn>=0.22.0
 scipy>=0.17.0
 sklearn-crfsuite==0.3.6
+tld==0.12.6
 dateparser==1.1.0
 joblib==0.17.0
 onnxruntime==1.9.0
\ No newline at end of file