Commit

feat: Development
the-superpirate committed Oct 26, 2023
1 parent c128041 commit 7c7aeb4
Showing 57 changed files with 3,250 additions and 703 deletions.
8 changes: 6 additions & 2 deletions cybrex/cybrex/cli.py
@@ -118,6 +118,7 @@ async def semantic_search(
n_chunks: int = 5,
n_documents: int = 10,
minimum_score: float = 0.5,
use_only_keywords: bool = True,
):
"""
Search related to query text chunks among `n` documents
@@ -126,20 +127,23 @@ async def semantic_search(
:param n_chunks: number of chunks to return
:param n_documents: the number of documents to extract from STC
:param minimum_score:
:param use_only_keywords:
"""
async with self.cybrex as cybrex:
print(f"{colored('Q', 'green')}: {query}")
scored_chunks = await cybrex.semantic_search(
query=query,
n_chunks=n_chunks,
n_documents=n_documents,
minimum_score=minimum_score
minimum_score=minimum_score,
use_only_keywords=use_only_keywords,
)
references = []
for scored_chunk in scored_chunks:
field, value = scored_chunk.chunk.document_id.split(':', 1)
document_id = f'{field}:{value}'
references.append(f' - {document_id}: {scored_chunk.chunk.title}\n {scored_chunk.chunk.text}')
title = scored_chunk.chunk.title.replace('\n', ' - ')
references.append(f' - {document_id}: {title}\n {scored_chunk.chunk.text}')
references = '\n'.join(references)
print(f"{colored('References', 'green')}:\n{references}")

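For context on the reference formatting change above (chunk titles may contain newlines, which are now folded into ' - '), here is a small self-contained illustration; the Chunk dataclass and sample values are stand-ins, not the repository's types:

from dataclasses import dataclass

@dataclass
class Chunk:
    document_id: str
    title: str
    text: str

# Sample chunk with a multi-line title, formatted the same way as the updated cli.py above.
chunk = Chunk(document_id='doi:10.1000/182', title='Skin expansion\nFollow-up', text='Forty-two patients ...')
field, value = chunk.document_id.split(':', 1)
document_id = f'{field}:{value}'
title = chunk.title.replace('\n', ' - ')
print(f' - {document_id}: {title}\n   {chunk.text}')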
15 changes: 12 additions & 3 deletions cybrex/cybrex/cybrex_ai.py
@@ -120,8 +120,15 @@ async def _get_missing_chunks(self, documents: List[SourceDocument], skip_downlo
'mode': 'cybrex',
'document_id': document.document_id,
})
document_chunks = await self.generate_chunks_from_document(document)
all_chunks.extend(document_chunks)
try:
document_chunks = await self.generate_chunks_from_document(document)
all_chunks.extend(document_chunks)
except ValueError:
logging.getLogger('statbox').info({
'action': 'broken_content',
'mode': 'cybrex',
'document_id': document.document_id,
})
return all_chunks

async def _search_in_vector_storage(self, query: str, n_chunks: int = 3,
@@ -363,6 +370,7 @@ async def semantic_search(
n_documents: int = 30,
minimum_score: float = 0.5,
skip_downloading_pdf: bool = True,
use_only_keywords: bool = True,
) -> List[ScoredChunk]:
"""
Flow for retrieving chunks by chunking documents relevant to `query`
@@ -372,9 +380,10 @@ async def semantic_search(
:param n_chunks:
:param n_documents:
:param minimum_score:
:param use_only_keywords:
:return:
"""
documents = await self.search_documents(query, n_documents, use_only_keywords=True)
documents = await self.search_documents(query, n_documents, use_only_keywords=use_only_keywords)
await self.upsert_documents(documents, skip_downloading_pdf=skip_downloading_pdf)
return await self._search_in_vector_storage(
query=query,
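The updated semantic_search now forwards use_only_keywords to the document search step; a compressed, hedged sketch of the overall pipeline (search documents, upsert them into the vector store, query the vector store), where all three helpers are hypothetical stand-ins rather than the repository's implementations:

import asyncio
from typing import List

async def search_documents(query: str, n_documents: int, use_only_keywords: bool) -> List[dict]:
    # Stand-in for the STC/Summa search step.
    return [{'document_id': 'doi:10.1000/182'}]

async def upsert_documents(documents: List[dict], skip_downloading_pdf: bool) -> None:
    # Stand-in for chunking + embedding + writing to the vector store.
    pass

async def search_in_vector_storage(query: str, n_chunks: int, minimum_score: float) -> List[dict]:
    # Stand-in for the nearest-neighbour query against the vector store.
    return [{'chunk': 'The epidermal structure of the followed-up skin ...', 'score': 0.7}]

async def semantic_search(query: str, n_chunks: int = 5, n_documents: int = 30,
                          minimum_score: float = 0.5, skip_downloading_pdf: bool = True,
                          use_only_keywords: bool = True) -> List[dict]:
    documents = await search_documents(query, n_documents, use_only_keywords=use_only_keywords)
    await upsert_documents(documents, skip_downloading_pdf=skip_downloading_pdf)
    return await search_in_vector_storage(query, n_chunks=n_chunks, minimum_score=minimum_score)

print(asyncio.run(semantic_search('skin expansion')))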
13 changes: 12 additions & 1 deletion cybrex/cybrex/data_source/geck_data_source.py
@@ -24,7 +24,18 @@ async def search_documents(
) -> List[SourceDocument]:
documents = await self.geck.get_summa_client().search_documents({
'index_alias': self.geck.index_alias,
'query': {'match': {'value': query.lower()}},
'query': {'boolean': {'subqueries': [
{'occur': 'must', 'query': {'match': {'value': query.lower()}}},
{'occur': 'must', 'query': {'term': {'field': 'languages', 'value': 'en'}}},
{'occur': 'must', 'query': {'boolean': {'subqueries': [
{'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'book'}}},
{'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'edited-book'}}},
{'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'monograph'}}},
{'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'reference-book'}}},
{'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'journal-article'}}},
{'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'wiki'}}},
]}}},
]}},
'collectors': [{'top_docs': {'limit': limit}}],
'is_fieldnorms_scoring_enabled': False,
})
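The new query above restricts results to English documents of a few whitelisted types; a small helper that rebuilds the same nested structure (the function name is illustrative, not part of the repository):

ALLOWED_TYPES = ['book', 'edited-book', 'monograph', 'reference-book', 'journal-article', 'wiki']

def build_summa_query(query: str) -> dict:
    # Mirrors the boolean query constructed in geck_data_source.py above.
    type_filter = {'boolean': {'subqueries': [
        {'occur': 'should', 'query': {'term': {'field': 'type', 'value': t}}}
        for t in ALLOWED_TYPES
    ]}}
    return {'boolean': {'subqueries': [
        {'occur': 'must', 'query': {'match': {'value': query.lower()}}},
        {'occur': 'must', 'query': {'term': {'field': 'languages', 'value': 'en'}}},
        {'occur': 'must', 'query': type_filter},
    ]}}

print(build_summa_query('Skin Expansion'))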
44 changes: 35 additions & 9 deletions cybrex/cybrex/document_chunker.py
@@ -26,8 +26,18 @@
'acknowledgements',
'supporting information',
'conflict of interest disclosures',
'conflict of interest',
'conflict of interest statement',
'ethics statement',
'references',
'external links',
'further reading',
'works cited',
'bibliography',
'notes',
'sources',
'footnotes',
'suggested readings',
}


@@ -66,7 +76,7 @@ def chunk_by_title(
first_element = section[0]

if isinstance(first_element, unstructured.documents.elements.Title):
current_title_parts[first_element.metadata.category_depth] = re.sub('\n+', ' ', str(first_element))
current_title_parts[first_element.metadata.category_depth] = re.sub('\n+', ' ', str(first_element).strip())
last_title_parts = first_element.metadata.category_depth
continue

@@ -126,24 +136,41 @@ def to_chunks(self, source_document: SourceDocument) -> List[Chunk]:
for section in list(soup.find_all('section')):
for child in section.children:
if (
child.name in {'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
and child.text.lower().strip() in BANNED_SECTIONS
child.name in {'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div'}
and child.text.lower().strip(' :,.;') in BANNED_SECTIONS
):
section.extract()
break
break

for summary in list(soup.select('details > summary.section-heading')):
if summary.text.lower().strip(' :,.;') in BANNED_SECTIONS:
summary.parent.extract()

for b_tag in list(soup.select('b, i')):
b_tag.unwrap()

for p_tag in list(soup.find_all('p')):
sibling = p_tag.next_sibling
while sibling == '\n':
sibling = sibling.next_sibling
if sibling and sibling.name == 'blockquote':
new_p_tag = soup.new_tag('p')
new_p_tag.extend([p_tag.text, ' ', sibling.text])
p_tag.replace_with(new_p_tag)
sibling.extract()

for header in list(soup.find_all('header')):
header.name = 'h1'

for el in list(soup.select('nav, ref, formula, math, figure, .Affiliations, '
for el in list(soup.select('table, nav, ref, formula, math, figure, img, [role="note"], .Affiliations, '
'.ArticleOrChapterToc, '
'.AuthorGroup, .ChapterContextInformation, '
'.Contacts, .CoverFigure, .Bibliography, '
'.BookTitlePage, .BookFrontmatter, .CopyrightPage, .Equation, '
'.FootnoteSection, .Table')):
'.FootnoteSection, .Table, .reference, .side-box-text, .thumbcaption')):
el.extract()

for el in list(soup.select('a')):
for el in list(soup.select('a, span')):
el.unwrap()

text = str(soup)
@@ -159,7 +186,6 @@ def to_chunks(self, source_document: SourceDocument) -> List[Chunk]:

chunks = []
chunk_id = 0

elements = chunk_by_title(elements)
for element in elements:
for chunk in self.text_splitter.split_text(str(element)):
@@ -170,7 +196,7 @@ def to_chunks(self, source_document: SourceDocument) -> List[Chunk]:
title_parts = [document["title"]]
if self.add_metadata:
if element.metadata.section:
title_parts.extend(element.metadata.section.split('\n'))
title_parts.extend(filter(bool, element.metadata.section.split('\n')))
parts.append(f'TITLE: {" ".join(title_parts)}')
if 'issued_at' in document:
issued_at = datetime.utcfromtimestamp(document['issued_at'])
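The expanded banned-section filter can be tried in isolation; a minimal sketch with a toy HTML snippet and a reduced banned set, assuming only BeautifulSoup (bs4) is installed:

from bs4 import BeautifulSoup

BANNED_SECTIONS = {'references', 'bibliography', 'further reading'}

html = (
    '<section><h2>Method</h2><p>Forty-two patients operated on by skin expansion ...</p></section>'
    '<section><h2>References:</h2><p>[1] ...</p></section>'
)
soup = BeautifulSoup(html, 'html.parser')
for section in list(soup.find_all('section')):
    for child in section.children:
        if (
            child.name in {'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div'}
            and child.text.lower().strip(' :,.;') in BANNED_SECTIONS
        ):
            section.extract()  # drop the whole banned section, heading included
            break
print(soup)  # only the Method section survives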
3 changes: 2 additions & 1 deletion cybrex/cybrex/vector_storage/qdrant.py
@@ -1,4 +1,5 @@
import dataclasses
import hashlib
import uuid
from typing import (
Iterable,
@@ -166,7 +167,7 @@ def upsert(self, chunks: List[Chunk]):
collection_name=self.collection_name,
points=[
PointStruct(
id=str(uuid.uuid1()),
id=hashlib.md5(f'{chunk.document_id}@{chunk.chunk_id}'.encode()).hexdigest(),
vector=embedding,
payload=dataclasses.asdict(chunk)
)
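Replacing the random uuid.uuid1() with an MD5 of document_id@chunk_id makes point IDs deterministic, so re-upserting the same chunk overwrites the existing point instead of creating a duplicate; the ID scheme in isolation (sample values are illustrative):

import hashlib

def chunk_point_id(document_id: str, chunk_id: int) -> str:
    # Same scheme as the diff above: stable across runs for the same chunk.
    return hashlib.md5(f'{document_id}@{chunk_id}'.encode()).hexdigest()

assert chunk_point_id('doi:10.1000/182', 0) == chunk_point_id('doi:10.1000/182', 0)
print(chunk_point_id('doi:10.1000/182', 0))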
75 changes: 75 additions & 0 deletions cybrex/examples/on-the-fly-translation.ipynb
@@ -0,0 +1,75 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from transformers import MBartForConditionalGeneration, MBart50TokenizerFast\n",
"\n",
"model = MBartForConditionalGeneration.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")\n",
"tokenizer = MBart50TokenizerFast.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"tokenizer.lang_code_to_id"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"article = \"Forty-two patients operated on by skin expansion have been contacted after a mean time of 25 months from the last surgery. Two biopsies have been taken from the expanded area of each patient. In 12 patients it has been possible to obtain a similar sampling from the opposite, nonexpanded area of the body. The samples underwent optic microscopy and cell kinetic and DNA content investigations. The epidermal structure of the followed-up skin, compared with the skin of the opposite side of the body, looks normal. The mitotic activity of the epidermal cells has returned to the values of preexpanded skin. The dermis shows a low degree of elastosis and zonal fragmentation of elastic fibers. The hypodermis, where the expander capsule was removed during the last surgery, does not show an accentuated fibrosis.\"\n",
"tokenizer.src_lang = \"en_XX\"\n",
"inputs = tokenizer(article, return_tensors=\"pt\")\n",
"\n",
"translated_tokens = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id[\"ru_RU\"], max_length=1024)\n",
"tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
2 changes: 1 addition & 1 deletion cybrex/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "cybrex"
version = "1.10.4"
version = "1.11.11"
authors = [{ name = "Interdimensional Walker" }]
description = "Researching AI"
readme = "README.md"
2 changes: 1 addition & 1 deletion cybrex/requirements.txt
@@ -16,5 +16,5 @@ pyyaml>=6.0
qdrant_client>=1.5.4
tiktoken>=0.5.1
safetensors==0.3.1
stc-geck>=1.8.32
stc-geck>=1.8.35
unstructured[html]>=0.10.21
2 changes: 1 addition & 1 deletion geck/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "stc-geck"
version = "1.8.33"
version = "1.8.35"
authors = [{ name = "Interdimensional Walker" }]
description = "GECK (Garden Of Eden Creation Kit) is a toolkit for setting up and maintaning STC"
readme = "README.md"
3 changes: 3 additions & 0 deletions geck/stc_geck/advices.py
@@ -31,6 +31,7 @@
'pubmed_id': 'id.pubmed_id',
'rd': 'references.doi',
'ser': 'metadata.series',
'wiki': 'id.wiki',
}


@@ -209,6 +210,8 @@ def get_internal_id(self):
return f'id.zlibrary_ids:{self.zlibrary_ids[-1]}'
elif self.nexus_id:
return f'id.nexus_id:{self.nexus_id}'
elif self.wiki:
return f'id.wiki'
else:
return None

2 changes: 1 addition & 1 deletion library/textutils/__init__.py
@@ -24,7 +24,7 @@
TELEGRAM_LINK_REGEX = re.compile('(?:https?://)?t\\.me/(?!joinchat/)([A-Za-z0-9_]+)')

DOI_WILDCARD_REGEX_TEXT = r'(10.\d{4,9}).*\.\*'
DOI_REGEX_TEXT = r'(?:doi.org/)?(10.\d{4,9})\s?/\s?([-._;()<>/:A-Za-z0-9]+[^.?\s])'
DOI_REGEX_TEXT = r'(?:doi.org/)?(10.\d{4,9})\s?(?:/|%2[Ff])\s?([%-._;()<>/:A-Za-z0-9]+[^.?\s])'
DOI_REGEX = re.compile(DOI_REGEX_TEXT)
ISBN_REGEX = re.compile(r'^(?:[iI][sS][bB][nN]\:?\s*)?((97(8|9))?\-?\d{9}(\d|X))$')
MD5_REGEX = re.compile(r'([A-Fa-f0-9]{32})')
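The widened DOI_REGEX_TEXT now also accepts a URL-encoded slash (%2F/%2f) between the DOI prefix and suffix, and allows % inside the suffix; a quick check with an illustrative DOI:

import re

DOI_REGEX = re.compile(r'(?:doi.org/)?(10.\d{4,9})\s?(?:/|%2[Ff])\s?([%-._;()<>/:A-Za-z0-9]+[^.?\s])')

print(DOI_REGEX.search('https://doi.org/10.1000/182').groups())    # ('10.1000', '182')
print(DOI_REGEX.search('https://doi.org/10.1000%2F182').groups())  # ('10.1000', '182')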
8 changes: 7 additions & 1 deletion tgbot/app/application.py
@@ -76,7 +76,12 @@ def __init__(self, config):
)
self.starts.append(self.cybrex_ai)

self.ipfs_http_client = IpfsHttpClient(base_url=config['ipfs']['http']['base_url'], retry_delay=5.0)
self.ipfs_http_client = IpfsHttpClient(
base_url=config['ipfs']['http']['base_url'],
max_retries=2,
retry_delay=5.0,
timeout=600,
)
self.starts.append(self.ipfs_http_client)

self.cloudflare_ipfs_http_client = IpfsHttpClient(base_url='https://cloudflare-ipfs.com', retry_delay=5.0)
@@ -142,6 +147,7 @@ def __init__(self, config):
self.promotioner = Promotioner(
promotions=get_promotions(),
promotion_vars=dict(
reddit_url=config['reddit']['url'],
twitter_contact_url=self.config['twitter']['contact_url'],
related_channel=self.config['telegram']['related_channel'],
)
2 changes: 1 addition & 1 deletion tgbot/app/database.py
@@ -120,7 +120,7 @@ async def add_approve(self, message_id, decision):
""", (message_id, decision))
await self.users_db_wrapper.db.commit()

async def add_vote_broken_file(self, bot_name, user_id, internal_id, cid):
async def add_vote_broken_file(self, bot_name, user_id, internal_id, cid, reason):
await self.users_db_wrapper.db.execute("""
INSERT OR IGNORE into file_votes(bot_name, user_id, doi, cid, vote) VALUES (?, ?, ?, ?, ?)
""", (bot_name, user_id, internal_id, cid, -1))
2 changes: 2 additions & 0 deletions tgbot/configs/base.yaml
@@ -71,6 +71,8 @@ metadata_retriever:
enabled: false
# Index for committing changes
index_alias: nexus_science
reddit:
url: https://reddit.com/r/science_nexus
telegram:
# Enabled handlers
command_handlers:
4 changes: 2 additions & 2 deletions tgbot/handlers/download.py
@@ -120,12 +120,12 @@ async def _on_fail():
)
try:
thumb_task = asyncio.create_task(download_thumb(self.document_holder.isbns))
file = await self.download_document(
file = await asyncio.wait_for(self.download_document(
cid=self.download_link['cid'],
progress_bar=progress_bar_download,
request_context=request_context,
filesize=self.download_link.get('filesize'),
)
), timeout=600)
if file:
request_context.statbox(
action='downloaded',
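Wrapping the download coroutine in asyncio.wait_for bounds how long a single download may block the handler; a minimal standalone sketch of the pattern with a simulated slow task:

import asyncio

async def slow_download() -> bytes:
    await asyncio.sleep(2)  # stand-in for a real IPFS download
    return b'file-bytes'

async def main():
    try:
        file = await asyncio.wait_for(slow_download(), timeout=600)
        print(len(file), 'bytes')
    except asyncio.TimeoutError:
        print('download timed out')

asyncio.run(main())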
1 change: 1 addition & 0 deletions tgbot/handlers/howtohelp.py
@@ -15,6 +15,7 @@ async def handler(self, event: events.ChatAction, request_context: RequestContex
request_context.statbox(action='show', mode='howtohelp')
await event.reply(
t('HOW_TO_HELP', request_context.chat['language']).format(
reddit_url=config['reddit'].get('url', '🚫'),
related_channel=config['telegram'].get('related_channel', '🚫'),
twitter_contact_url=config['twitter'].get('contact_url', '🚫')
))
