Commit

feat: Development
the-superpirate committed Oct 26, 2023
1 parent c128041 commit 7c7aeb4
Showing 57 changed files with 3,250 additions and 703 deletions.
8 changes: 6 additions & 2 deletions cybrex/cybrex/cli.py
@@ -118,6 +118,7 @@ async def semantic_search(
n_chunks: int = 5,
n_documents: int = 10,
minimum_score: float = 0.5,
use_only_keywords: bool = True,
):
"""
Search related to query text chunks among `n` documents
@@ -126,20 +127,23 @@ async def semantic_search(
:param n_chunks: number of chunks to return
:param n_documents: the number of documents to extract from STC
:param minimum_score:
:param use_only_keywords:
"""
async with self.cybrex as cybrex:
print(f"{colored('Q', 'green')}: {query}")
scored_chunks = await cybrex.semantic_search(
query=query,
n_chunks=n_chunks,
n_documents=n_documents,
minimum_score=minimum_score
minimum_score=minimum_score,
use_only_keywords=use_only_keywords,
)
references = []
for scored_chunk in scored_chunks:
field, value = scored_chunk.chunk.document_id.split(':', 1)
document_id = f'{field}:{value}'
references.append(f' - {document_id}: {scored_chunk.chunk.title}\n {scored_chunk.chunk.text}')
title = scored_chunk.chunk.title.replace('\n', ' - ')
references.append(f' - {document_id}: {title}\n {scored_chunk.chunk.text}')
references = '\n'.join(references)
print(f"{colored('References', 'green')}:\n{references}")

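For context on the reference formatting change above (chunk titles may contain newlines, which are now folded into ' - '), here is a small self-contained illustration; the Chunk dataclass and sample values are stand-ins, not the repository's types:

from dataclasses import dataclass

@dataclass
class Chunk:
    document_id: str
    title: str
    text: str

# Sample chunk with a multi-line title, formatted the same way as the updated cli.py above.
chunk = Chunk(document_id='doi:10.1000/182', title='Skin expansion\nFollow-up', text='Forty-two patients ...')
field, value = chunk.document_id.split(':', 1)
document_id = f'{field}:{value}'
title = chunk.title.replace('\n', ' - ')
print(f' - {document_id}: {title}\n   {chunk.text}')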
15 changes: 12 additions & 3 deletions cybrex/cybrex/cybrex_ai.py
@@ -120,8 +120,15 @@ async def _get_missing_chunks(self, documents: List[SourceDocument], skip_downlo
'mode': 'cybrex',
'document_id': document.document_id,
})
document_chunks = await self.generate_chunks_from_document(document)
all_chunks.extend(document_chunks)
try:
document_chunks = await self.generate_chunks_from_document(document)
all_chunks.extend(document_chunks)
except ValueError:
logging.getLogger('statbox').info({
'action': 'broken_content',
'mode': 'cybrex',
'document_id': document.document_id,
})
return all_chunks

async def _search_in_vector_storage(self, query: str, n_chunks: int = 3,
@@ -363,6 +370,7 @@ async def semantic_search(
n_documents: int = 30,
minimum_score: float = 0.5,
skip_downloading_pdf: bool = True,
use_only_keywords: bool = True,
) -> List[ScoredChunk]:
"""
Flow for retrieving chunks by chunking documents relevant to `query`
@@ -372,9 +380,10 @@ async def semantic_search(
:param n_chunks:
:param n_documents:
:param minimum_score:
:param use_only_keywords:
:return:
"""
documents = await self.search_documents(query, n_documents, use_only_keywords=True)
documents = await self.search_documents(query, n_documents, use_only_keywords=use_only_keywords)
await self.upsert_documents(documents, skip_downloading_pdf=skip_downloading_pdf)
return await self._search_in_vector_storage(
query=query,
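The updated semantic_search now forwards use_only_keywords to the document search step; a compressed, hedged sketch of the overall pipeline (search documents, upsert them into the vector store, query the vector store), where all three helpers are hypothetical stand-ins rather than the repository's implementations:

import asyncio
from typing import List

async def search_documents(query: str, n_documents: int, use_only_keywords: bool) -> List[dict]:
    # Stand-in for the STC/Summa search step.
    return [{'document_id': 'doi:10.1000/182'}]

async def upsert_documents(documents: List[dict], skip_downloading_pdf: bool) -> None:
    # Stand-in for chunking + embedding + writing to the vector store.
    pass

async def search_in_vector_storage(query: str, n_chunks: int, minimum_score: float) -> List[dict]:
    # Stand-in for the nearest-neighbour query against the vector store.
    return [{'chunk': 'The epidermal structure of the followed-up skin ...', 'score': 0.7}]

async def semantic_search(query: str, n_chunks: int = 5, n_documents: int = 30,
                          minimum_score: float = 0.5, skip_downloading_pdf: bool = True,
                          use_only_keywords: bool = True) -> List[dict]:
    documents = await search_documents(query, n_documents, use_only_keywords=use_only_keywords)
    await upsert_documents(documents, skip_downloading_pdf=skip_downloading_pdf)
    return await search_in_vector_storage(query, n_chunks=n_chunks, minimum_score=minimum_score)

print(asyncio.run(semantic_search('skin expansion')))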
13 changes: 12 additions & 1 deletion cybrex/cybrex/data_source/geck_data_source.py
@@ -24,7 +24,18 @@ async def search_documents(
) -> List[SourceDocument]:
documents = await self.geck.get_summa_client().search_documents({
'index_alias': self.geck.index_alias,
'query': {'match': {'value': query.lower()}},
'query': {'boolean': {'subqueries': [
{'occur': 'must', 'query': {'match': {'value': query.lower()}}},
{'occur': 'must', 'query': {'term': {'field': 'languages', 'value': 'en'}}},
{'occur': 'must', 'query': {'boolean': {'subqueries': [
{'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'book'}}},
{'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'edited-book'}}},
{'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'monograph'}}},
{'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'reference-book'}}},
{'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'journal-article'}}},
{'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'wiki'}}},
]}}},
]}},
'collectors': [{'top_docs': {'limit': limit}}],
'is_fieldnorms_scoring_enabled': False,
})
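The new query above restricts results to English documents of a few whitelisted types; a small helper that rebuilds the same nested structure (the function name is illustrative, not part of the repository):

ALLOWED_TYPES = ['book', 'edited-book', 'monograph', 'reference-book', 'journal-article', 'wiki']

def build_summa_query(query: str) -> dict:
    # Mirrors the boolean query constructed in geck_data_source.py above.
    type_filter = {'boolean': {'subqueries': [
        {'occur': 'should', 'query': {'term': {'field': 'type', 'value': t}}}
        for t in ALLOWED_TYPES
    ]}}
    return {'boolean': {'subqueries': [
        {'occur': 'must', 'query': {'match': {'value': query.lower()}}},
        {'occur': 'must', 'query': {'term': {'field': 'languages', 'value': 'en'}}},
        {'occur': 'must', 'query': type_filter},
    ]}}

print(build_summa_query('Skin Expansion'))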
44 changes: 35 additions & 9 deletions cybrex/cybrex/document_chunker.py
@@ -26,8 +26,18 @@
'acknowledgements',
'supporting information',
'conflict of interest disclosures',
'conflict of interest',
'conflict of interest statement',
'ethics statement',
'references',
'external links',
'further reading',
'works cited',
'bibliography',
'notes',
'sources',
'footnotes',
'suggested readings',
}


@@ -66,7 +76,7 @@ def chunk_by_title(
first_element = section[0]

if isinstance(first_element, unstructured.documents.elements.Title):
current_title_parts[first_element.metadata.category_depth] = re.sub('\n+', ' ', str(first_element))
current_title_parts[first_element.metadata.category_depth] = re.sub('\n+', ' ', str(first_element).strip())
last_title_parts = first_element.metadata.category_depth
continue

@@ -126,24 +136,41 @@ def to_chunks(self, source_document: SourceDocument) -> List[Chunk]:
for section in list(soup.find_all('section')):
for child in section.children:
if (
child.name in {'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
and child.text.lower().strip() in BANNED_SECTIONS
child.name in {'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div'}
and child.text.lower().strip(' :,.;') in BANNED_SECTIONS
):
section.extract()
break
break

for summary in list(soup.select('details > summary.section-heading')):
if summary.text.lower().strip(' :,.;') in BANNED_SECTIONS:
summary.parent.extract()

for b_tag in list(soup.select('b, i')):
b_tag.unwrap()

for p_tag in list(soup.find_all('p')):
sibling = p_tag.next_sibling
while sibling == '\n':
sibling = sibling.next_sibling
if sibling and sibling.name == 'blockquote':
new_p_tag = soup.new_tag('p')
new_p_tag.extend([p_tag.text, ' ', sibling.text])
p_tag.replace_with(new_p_tag)
sibling.extract()

for header in list(soup.find_all('header')):
header.name = 'h1'

for el in list(soup.select('nav, ref, formula, math, figure, .Affiliations, '
for el in list(soup.select('table, nav, ref, formula, math, figure, img, [role="note"], .Affiliations, '
'.ArticleOrChapterToc, '
'.AuthorGroup, .ChapterContextInformation, '
'.Contacts, .CoverFigure, .Bibliography, '
'.BookTitlePage, .BookFrontmatter, .CopyrightPage, .Equation, '
'.FootnoteSection, .Table')):
'.FootnoteSection, .Table, .reference, .side-box-text, .thumbcaption')):
el.extract()

for el in list(soup.select('a')):
for el in list(soup.select('a, span')):
el.unwrap()

text = str(soup)
@@ -159,7 +186,6 @@ def to_chunks(self, source_document: SourceDocument) -> List[Chunk]:

chunks = []
chunk_id = 0

elements = chunk_by_title(elements)
for element in elements:
for chunk in self.text_splitter.split_text(str(element)):
@@ -170,7 +196,7 @@ def to_chunks(self, source_document: SourceDocument) -> List[Chunk]:
title_parts = [document["title"]]
if self.add_metadata:
if element.metadata.section:
title_parts.extend(element.metadata.section.split('\n'))
title_parts.extend(filter(bool, element.metadata.section.split('\n')))
parts.append(f'TITLE: {" ".join(title_parts)}')
if 'issued_at' in document:
issued_at = datetime.utcfromtimestamp(document['issued_at'])
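The expanded banned-section filter can be tried in isolation; a minimal sketch with a toy HTML snippet and a reduced banned set, assuming only BeautifulSoup (bs4) is installed:

from bs4 import BeautifulSoup

BANNED_SECTIONS = {'references', 'bibliography', 'further reading'}

html = (
    '<section><h2>Method</h2><p>Forty-two patients operated on by skin expansion ...</p></section>'
    '<section><h2>References:</h2><p>[1] ...</p></section>'
)
soup = BeautifulSoup(html, 'html.parser')
for section in list(soup.find_all('section')):
    for child in section.children:
        if (
            child.name in {'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div'}
            and child.text.lower().strip(' :,.;') in BANNED_SECTIONS
        ):
            section.extract()  # drop the whole banned section, heading included
            break
print(soup)  # only the Method section survives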
3 changes: 2 additions & 1 deletion cybrex/cybrex/vector_storage/qdrant.py
@@ -1,4 +1,5 @@
import dataclasses
import hashlib
import uuid
from typing import (
Iterable,
@@ -166,7 +167,7 @@ def upsert(self, chunks: List[Chunk]):
collection_name=self.collection_name,
points=[
PointStruct(
id=str(uuid.uuid1()),
id=hashlib.md5(f'{chunk.document_id}@{chunk.chunk_id}'.encode()).hexdigest(),
vector=embedding,
payload=dataclasses.asdict(chunk)
)
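Replacing the random uuid.uuid1() with an MD5 of document_id@chunk_id makes point IDs deterministic, so re-upserting the same chunk overwrites the existing point instead of creating a duplicate; the ID scheme in isolation (sample values are illustrative):

import hashlib

def chunk_point_id(document_id: str, chunk_id: int) -> str:
    # Same scheme as the diff above: stable across runs for the same chunk.
    return hashlib.md5(f'{document_id}@{chunk_id}'.encode()).hexdigest()

assert chunk_point_id('doi:10.1000/182', 0) == chunk_point_id('doi:10.1000/182', 0)
print(chunk_point_id('doi:10.1000/182', 0))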
75 changes: 75 additions & 0 deletions cybrex/examples/on-the-fly-translation.ipynb
@@ -0,0 +1,75 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from transformers import MBartForConditionalGeneration, MBart50TokenizerFast\n",
"\n",
"model = MBartForConditionalGeneration.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")\n",
"tokenizer = MBart50TokenizerFast.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"tokenizer.lang_code_to_id"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"article = \"Forty-two patients operated on by skin expansion have been contacted after a mean time of 25 months from the last surgery. Two biopsies have been taken from the expanded area of each patient. In 12 patients it has been possible to obtain a similar sampling from the opposite, nonexpanded area of the body. The samples underwent optic microscopy and cell kinetic and DNA content investigations. The epidermal structure of the followed-up skin, compared with the skin of the opposite side of the body, looks normal. The mitotic activity of the epidermal cells has returned to the values of preexpanded skin. The dermis shows a low degree of elastosis and zonal fragmentation of elastic fibers. The hypodermis, where the expander capsule was removed during the last surgery, does not show an accentuated fibrosis.\"\n",
"tokenizer.src_lang = \"en_XX\"\n",
"inputs = tokenizer(article, return_tensors=\"pt\")\n",
"\n",
"translated_tokens = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id[\"ru_RU\"], max_length=1024)\n",
"tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
2 changes: 1 addition & 1 deletion cybrex/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "cybrex"
version = "1.10.4"
version = "1.11.11"
authors = [{ name = "Interdimensional Walker" }]
description = "Researching AI"
readme = "README.md"
2 changes: 1 addition & 1 deletion cybrex/requirements.txt
@@ -16,5 +16,5 @@ pyyaml>=6.0
qdrant_client>=1.5.4
tiktoken>=0.5.1
safetensors==0.3.1
stc-geck>=1.8.32
stc-geck>=1.8.35
unstructured[html]>=0.10.21
2 changes: 1 addition & 1 deletion geck/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "stc-geck"
version = "1.8.33"
version = "1.8.35"
authors = [{ name = "Interdimensional Walker" }]
description = "GECK (Garden Of Eden Creation Kit) is a toolkit for setting up and maintaning STC"
readme = "README.md"
3 changes: 3 additions & 0 deletions geck/stc_geck/advices.py
@@ -31,6 +31,7 @@
'pubmed_id': 'id.pubmed_id',
'rd': 'references.doi',
'ser': 'metadata.series',
'wiki': 'id.wiki',
}


@@ -209,6 +210,8 @@ def get_internal_id(self):
return f'id.zlibrary_ids:{self.zlibrary_ids[-1]}'
elif self.nexus_id:
return f'id.nexus_id:{self.nexus_id}'
elif self.wiki:
return f'id.wiki'
else:
return None

2 changes: 1 addition & 1 deletion library/textutils/__init__.py
@@ -24,7 +24,7 @@
TELEGRAM_LINK_REGEX = re.compile('(?:https?://)?t\\.me/(?!joinchat/)([A-Za-z0-9_]+)')

DOI_WILDCARD_REGEX_TEXT = r'(10.\d{4,9}).*\.\*'
DOI_REGEX_TEXT = r'(?:doi.org/)?(10.\d{4,9})\s?/\s?([-._;()<>/:A-Za-z0-9]+[^.?\s])'
DOI_REGEX_TEXT = r'(?:doi.org/)?(10.\d{4,9})\s?(?:/|%2[Ff])\s?([%-._;()<>/:A-Za-z0-9]+[^.?\s])'
DOI_REGEX = re.compile(DOI_REGEX_TEXT)
ISBN_REGEX = re.compile(r'^(?:[iI][sS][bB][nN]\:?\s*)?((97(8|9))?\-?\d{9}(\d|X))$')
MD5_REGEX = re.compile(r'([A-Fa-f0-9]{32})')
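The widened DOI_REGEX_TEXT now also accepts a URL-encoded slash (%2F/%2f) between the DOI prefix and suffix, and allows % inside the suffix; a quick check with an illustrative DOI:

import re

DOI_REGEX = re.compile(r'(?:doi.org/)?(10.\d{4,9})\s?(?:/|%2[Ff])\s?([%-._;()<>/:A-Za-z0-9]+[^.?\s])')

print(DOI_REGEX.search('https://doi.org/10.1000/182').groups())    # ('10.1000', '182')
print(DOI_REGEX.search('https://doi.org/10.1000%2F182').groups())  # ('10.1000', '182')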
8 changes: 7 additions & 1 deletion tgbot/app/application.py
@@ -76,7 +76,12 @@ def __init__(self, config):
)
self.starts.append(self.cybrex_ai)

self.ipfs_http_client = IpfsHttpClient(base_url=config['ipfs']['http']['base_url'], retry_delay=5.0)
self.ipfs_http_client = IpfsHttpClient(
base_url=config['ipfs']['http']['base_url'],
max_retries=2,
retry_delay=5.0,
timeout=600,
)
self.starts.append(self.ipfs_http_client)

self.cloudflare_ipfs_http_client = IpfsHttpClient(base_url='https://cloudflare-ipfs.com', retry_delay=5.0)
@@ -142,6 +147,7 @@ def __init__(self, config):
self.promotioner = Promotioner(
promotions=get_promotions(),
promotion_vars=dict(
reddit_url=config['reddit']['url'],
twitter_contact_url=self.config['twitter']['contact_url'],
related_channel=self.config['telegram']['related_channel'],
)
2 changes: 1 addition & 1 deletion tgbot/app/database.py
@@ -120,7 +120,7 @@ async def add_approve(self, message_id, decision):
""", (message_id, decision))
await self.users_db_wrapper.db.commit()

async def add_vote_broken_file(self, bot_name, user_id, internal_id, cid):
async def add_vote_broken_file(self, bot_name, user_id, internal_id, cid, reason):
await self.users_db_wrapper.db.execute("""
INSERT OR IGNORE into file_votes(bot_name, user_id, doi, cid, vote) VALUES (?, ?, ?, ?, ?)
""", (bot_name, user_id, internal_id, cid, -1))
2 changes: 2 additions & 0 deletions tgbot/configs/base.yaml
@@ -71,6 +71,8 @@ metadata_retriever:
enabled: false
# Index for committing changes
index_alias: nexus_science
reddit:
url: https://reddit.com/r/science_nexus
telegram:
# Enabled handlers
command_handlers:
4 changes: 2 additions & 2 deletions tgbot/handlers/download.py
@@ -120,12 +120,12 @@ async def _on_fail():
)
try:
thumb_task = asyncio.create_task(download_thumb(self.document_holder.isbns))
file = await self.download_document(
file = await asyncio.wait_for(self.download_document(
cid=self.download_link['cid'],
progress_bar=progress_bar_download,
request_context=request_context,
filesize=self.download_link.get('filesize'),
)
), timeout=600)
if file:
request_context.statbox(
action='downloaded',
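Wrapping the download coroutine in asyncio.wait_for bounds how long a single download may block the handler; a minimal standalone sketch of the pattern with a simulated slow task:

import asyncio

async def slow_download() -> bytes:
    await asyncio.sleep(2)  # stand-in for a real IPFS download
    return b'file-bytes'

async def main():
    try:
        file = await asyncio.wait_for(slow_download(), timeout=600)
        print(len(file), 'bytes')
    except asyncio.TimeoutError:
        print('download timed out')

asyncio.run(main())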
1 change: 1 addition & 0 deletions tgbot/handlers/howtohelp.py
@@ -15,6 +15,7 @@ async def handler(self, event: events.ChatAction, request_context: RequestContex
request_context.statbox(action='show', mode='howtohelp')
await event.reply(
t('HOW_TO_HELP', request_context.chat['language']).format(
reddit_url=config['reddit'].get('url', '🚫'),
related_channel=config['telegram'].get('related_channel', '🚫'),
twitter_contact_url=config['twitter'].get('contact_url', '🚫')
))
