Commit 8ddff82
Version 0.1.2. Updating PyTextRankPhrases.py to support PyTextRank 3. Replacing spacy.load('en') with spacy.load('en_core_web_sm'). Fixing a failing test case.
JasonKessler committed Mar 8, 2021
1 parent 8242767 commit 8ddff82
Showing 22 changed files with 116 additions and 41 deletions.
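The change repeated across the demos below is the spaCy model-loading migration. A minimal sketch of the new pattern, assuming spaCy 3 with the en_core_web_sm package installed via python -m spacy download en_core_web_sm:

import spacy

# spaCy 3 removed the 'en' shortcut link, so the full package name is
# required; spacy.load('en') fails on a fresh spaCy 3 install.
nlp = spacy.load('en_core_web_sm')  # was: spacy.load('en')
doc = nlp("Scattertext finds distinguishing terms in corpora.")
print([token.text for token in doc])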
4 changes: 3 additions & 1 deletion README.md
@@ -3,7 +3,7 @@
[![Gitter Chat](https://img.shields.io/badge/GITTER-join%20chat-green.svg)](https://gitter.im/scattertext/Lobby)
[![Twitter Follow](https://img.shields.io/twitter/follow/espadrine.svg?style=social&label=Follow)](https://twitter.com/jasonkessler)

- # Scattertext 0.1.1
+ # Scattertext 0.1.2

A tool for finding distinguishing terms in corpora and displaying them in an
interactive HTML scatter plot. Points corresponding to terms are selectively labeled
@@ -492,6 +492,8 @@
import pytextrank, spacy
import scattertext as st
nlp = spacy.load('en')
+ nlp.add_pipe("textrank", last=True)
convention_df = st.SampleCorpora.ConventionData2012.get_data().assign(
parse=lambda df: df.text.apply(nlp),
party=lambda df: df.party.apply({'democrat': 'Democratic', 'republican': 'Republican'}.get)
2 changes: 1 addition & 1 deletion demo_custom_coordinates.py
@@ -6,7 +6,7 @@
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.Scalers import scale

- nlp = spacy.load('en')
+ nlp = spacy.load('en_core_web_sm')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
category_col='party',
2 changes: 1 addition & 1 deletion demo_expected_vs_actual.py
@@ -5,7 +5,7 @@
from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

- nlp = spacy.load('en')
+ nlp = spacy.load('en_core_web_sm')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
category_col='party',
2 changes: 1 addition & 1 deletion demo_four_square.py
@@ -5,7 +5,7 @@

import scattertext as st

- nlp = spacy.load('en', parser=False)
+ nlp = spacy.load('en_core_web_sm', parser=False)
t0 = time.time()
reviews_df = pd.read_csv('https://github.com/JasonKessler/ICLR18ReviewVis/raw/master/iclr2018_reviews.csv.bz2')
reviews_df['parse'] = reviews_df['review'].apply(st.whitespace_nlp_with_sentences)
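A note on parser=False above (and in demo_phrase_machine.py below): it is a spaCy 1.x-era keyword, not the supported way to skip components in later versions. A sketch of the spaCy 2/3 equivalent, offered as an assumption rather than as part of this commit:

import spacy

# Load the model without running the dependency parser.
nlp = spacy.load('en_core_web_sm', disable=['parser'])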
2 changes: 1 addition & 1 deletion demo_gensim_similarity.py
@@ -7,7 +7,7 @@


def main():
- nlp = spacy.load('en')
+ nlp = spacy.load('en_core_web_sm')
#nlp = whitespace_nlp_with_sentences
convention_df = SampleCorpora.ConventionData2012.get_data()
convention_df['parsed'] = convention_df.text.apply(nlp)
2 changes: 1 addition & 1 deletion demo_insignificant_greyed_out.py
@@ -4,7 +4,7 @@
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termscoring.LogOddsUniformativePriorScore import LogOddsUninformativePriorScore

- nlp = spacy.load('en')
+ nlp = spacy.load('en_core_web_sm')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
category_col='party',
2 changes: 1 addition & 1 deletion demo_lemmas.py
@@ -1,6 +1,6 @@
import scattertext as st
import spacy
- nlp = spacy.load('en')
+ nlp = spacy.load('en_core_web_sm')

df = st.SampleCorpora.ConventionData2012.get_data().assign(
parse=lambda df: df.text.apply(nlp)
2 changes: 1 addition & 1 deletion demo_names.py
@@ -1,7 +1,7 @@
import scattertext as st
import spacy

- nlp = spacy.load('en')
+ nlp = spacy.load('en_core_web_sm')

df = st.SampleCorpora.ConventionData2012.get_data().assign(
parse=lambda df: list(nlp.pipe(df.text))
2 changes: 1 addition & 1 deletion demo_phrase_machine.py
@@ -8,7 +8,7 @@
category_col='party',
text_col='text',
feats_from_spacy_doc=PhraseMachinePhrases(),
- nlp=spacy.load('en', parser=False))
+ nlp=spacy.load('en_core_web_sm', parser=False))
.build().compact(AssociationCompactor(4000)))

html = produce_scattertext_explorer(corpus,
4 changes: 2 additions & 2 deletions demo_pytextrank.py
@@ -5,7 +5,8 @@
import numpy as np
import pytextrank

- nlp = spacy.load('en')
+ nlp = spacy.load('en_core_web_sm')
+ nlp.add_pipe("textrank", last=True)

convention_df = SampleCorpora.ConventionData2012.get_data().assign(
parse=lambda df: df.text.apply(nlp),
@@ -63,7 +64,6 @@
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open %s in Chrome or Firefox.' % file_name)
-
html = produce_scattertext_explorer(
corpus,
category='Democratic',
2 changes: 1 addition & 1 deletion demo_similarity.py
@@ -5,7 +5,7 @@


def main():
- nlp = spacy.load('en')
+ nlp = spacy.load('en_core_web_sm')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
category_col='party',
2 changes: 1 addition & 1 deletion demo_sparse.py
@@ -5,7 +5,7 @@
from scattertext import SampleCorpora, sparse_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

- nlp = spacy.load('en')
+ nlp = spacy.load('en_core_web_sm')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
category_col='party',
2 changes: 1 addition & 1 deletion demo_with_apostrophes.py
@@ -1,7 +1,7 @@
import scattertext as st
import spacy

- nlp = spacy.blank('en')
+ nlp = spacy.blank('en_core_web_sm')
nlp.tokenizer.rules = {key: value for key, value in nlp.tokenizer.rules.items()
if "'" not in key and "’" not in key and "‘" not in key}
nlp.add_pipe(nlp.create_pipe('sentencizer'))
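A caveat on the hunk above: spacy.blank() takes an ISO language code, not a model package name, so spacy.blank('en_core_web_sm') should fail to resolve a language. A sketch of the distinction, not part of this commit:

import spacy

nlp_blank = spacy.blank('en')            # blank pipeline: pass a language code
nlp_full = spacy.load('en_core_web_sm')  # trained pipeline: pass a package name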
4 changes: 2 additions & 2 deletions scattertext/CLI.py
@@ -51,8 +51,8 @@ def main():
help="If present, don't use spaCy for preprocessing. Instead, "
"use a simple, dumb, regex.")
parser.add_argument('--spacy_language_model', action='store',
- dest='spacy_language_model', default='en',
- help="If present, pick the spaCy language model to use. Default is 'en'. "
+ dest='spacy_language_model', default='en_core_web_sm',
+ help="If present, pick the spaCy language model to use. Default is 'en_core_web_sm'. "
"Other valid values include 'de' and 'fr'. --regex_parser will override."
"Please see https://spacy.io/docs/api/language-models for moredetails")
parser.add_argument('--one_use_per_doc', action='store_true',
6 changes: 2 additions & 4 deletions scattertext/TermDocMatrix.py
@@ -764,11 +764,8 @@ def use_external_metadata_lists(self, metadata_lists):
        metadata_index_store = IndexStore()
        metadata_csr_factory = CSRMatrixFactory()
        assert len(metadata_lists) == self.get_num_docs()
-       print("STARTING")
        for doc_i, metadata_list in enumerate(metadata_lists):
-           print("L", metadata_list)
            for metadatum in metadata_list:
-               print("METADATUM", metadatum)
                # raise Exception(str(metadatum)
                #                 + " " + str(type(metadatum)) + " " + str(len(metadatum)) + str(metadata_list)
                #                 + " " + str(type(metadata_list)) + " " + str(len(metadata_list)) + str(metadata_lists))
@@ -802,7 +799,8 @@ def use_doc_labeled_terms_as_metadata(self, doc_labels, separator='_', replace_m
        ordered_doc_labels = list(sorted(set(doc_labels)))
        X = self._X
        if replace_metadata:
-           X = self._mX
+           #X = self._mX
+           X = self._X

        for doc_label in ordered_doc_labels:
            label_doc_mask = doc_labels == doc_label
4 changes: 2 additions & 2 deletions scattertext/TermDocMatrixFactory.py
@@ -38,7 +38,7 @@ def __init__(self,
the new string.
post_nlp_clean_function : function (default lambda x: x)
A function that takes a spaCy Doc
- nlp : spacy.load('en') (default None)
+ nlp : spacy.load('en_core_web_sm') (default None)
The spaCy parser used to parse documents. If it's None,
the class will go through the expensive operation of
creating one to parse the text
@@ -143,7 +143,7 @@ def get_nlp(self):
nlp = self._nlp
if nlp is None:
import spacy
- nlp = spacy.load('en')
+ nlp = spacy.load('en_core_web_sm')
return nlp

def censor_entity_types(self, entity_types):
6 changes: 3 additions & 3 deletions scattertext/__init__.py
@@ -1,6 +1,6 @@
from __future__ import print_function

- version = [0, 1, 1]
+ version = [0, 1, 2]
__version__ = '.'.join([str(e) for e in version])

import re
@@ -928,7 +928,7 @@ def word_similarity_explorer(corpus,
target_term : str
Word or phrase for semantic similarity comparison
nlp : spaCy-like parsing function
- E.g., spacy.load('en'), whitespace_nlp, etc...
+ E.g., spacy.load('en_core_web_sm'), whitespace_nlp, etc...
alpha : float, default = 0.01
Uniform dirichlet prior for p-value calculation
max_p_val : float, default = 0.1
@@ -941,7 +941,7 @@

if nlp is None:
import spacy
- nlp = spacy.load('en')
+ nlp = spacy.load('en_core_web_sm')

base_term = nlp(target_term)
scores = np.array([base_term.similarity(nlp(tok))
6 changes: 3 additions & 3 deletions scattertext/external/phrasemachine/phrasemachine.py
@@ -254,14 +254,14 @@ def get_stdeng_spacy_tagger(suppress_errors=False):
try:
import spacy
SPACY_WRAPPER = SpacyTagger()
- SPACY_WRAPPER.spacy_object = spacy.load('en', parser=False, entity=False)
+ SPACY_WRAPPER.spacy_object = spacy.load('en_core_web_sm', parser=False, entity=False)
return SPACY_WRAPPER
except ImportError:
if not suppress_errors: raise
except RuntimeError:
- ## this seems to happen if the 'en' model is not installed. it might
+ ## this seems to happen if the 'en_core_web_sm' model is not installed. it might
## look like this:
- # RuntimeError: Model 'en' not installed. Please run 'python -m spacy.en.download' to install latest compatible model.
+ # RuntimeError: Model 'en_core_web_sm' not installed. Please run 'python -m spacy.en.download' to install latest compatible model.
if not suppress_errors: raise
return None

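For reference, a guarded load matching current spaCy behavior, where a missing model package surfaces as OSError and the spacy.en.download command referenced above no longer exists; a sketch under those assumptions, not part of this commit:

import spacy

try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    raise OSError("Model 'en_core_web_sm' not installed. "
                  "Run: python -m spacy download en_core_web_sm")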
29 changes: 19 additions & 10 deletions scattertext/features/PyTextRankPhrases.py
@@ -29,17 +29,26 @@ def set_rank_smoothing_constant(self, rank_smoothing_constant):
        return self

    def get_doc_metadata(self, doc):
-       import pytextrank
        phrase_counter = Counter()
-       tr = pytextrank.TextRank()
-       tr.doc = doc
-       phrases = tr.calc_textrank()
-       for phrase in phrases:
-           if self._include_chunks:
-               for chunk in phrase.chunks:
-                   phrase_counter[str(chunk)] += (phrase.rank + self._rank_smoothing_constant)
-           else:
-               phrase_counter[phrase.text] += phrase.count * (phrase.rank + self._rank_smoothing_constant)
+       try:
+           for phrase in doc._.phrases:
+               if self._include_chunks:
+                   for chunk in phrase.chunks:
+                       phrase_counter[str(chunk)] += (phrase.rank + self._rank_smoothing_constant)
+               else:
+                   phrase_counter[phrase.text] += phrase.count * (phrase.rank + self._rank_smoothing_constant)
+       except:  # Support for pytextrank<3
+           import pytextrank
+           tr = pytextrank.TextRank()
+           tr.doc = doc
+           phrases = tr.calc_textrank()
+           for phrase in phrases:
+               if self._include_chunks:
+                   for chunk in phrase.chunks:
+                       phrase_counter[str(chunk)] += (phrase.rank + self._rank_smoothing_constant)
+               else:
+                   phrase_counter[phrase.text] += phrase.count * (phrase.rank + self._rank_smoothing_constant)

        return phrase_counter

def get_feats(self, doc):
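A minimal sketch of the two API generations the new try/except bridges, assuming pytextrank 3.x and en_core_web_sm are installed:

import pytextrank  # importing registers the "textrank" pipeline factory
import spacy

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("textrank", last=True)  # pytextrank>=3 runs as a spaCy pipe
doc = nlp("Natural language processing ranks candidate phrases in a graph.")
for phrase in doc._.phrases:         # ranked phrases land on the doc._ extension
    print(phrase.text, phrase.rank, phrase.count)

Under pytextrank<3 there is no "textrank" pipe and doc._.phrases is absent, so the loop raises and the except branch falls back to building pytextrank.TextRank() by hand as before.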
66 changes: 66 additions & 0 deletions scattertext/tokenizers/roberta.py
@@ -0,0 +1,66 @@
from scattertext.WhitespaceNLP import Tok, _get_pos_tag, Doc


class RobertaTokenizerWrapper:
    '''
    Encapsulates the RoBERTa tokenizer
    '''
    def __init__(self,
                 tokenizer,
                 decoder=None,
                 entity_type=None,
                 tag_type=None):
        self.tokenizer = tokenizer
        if decoder is None:
            try:
                from text_unidecode import unidecode
            except:
                raise Exception("Please install the text_unidecode package to preprocess documents. "
                                "If you'd like to bypass this step, pass a text preprocessing function into "
                                "the decoder parameter of this class.")
            self.decoder = unidecode
        else:
            self.decoder = decoder
        self.entity_type = entity_type
        self.tag_type = tag_type

    def tokenize(self, doc):
        '''
        doc: str, text to be tokenized
        '''

        sents = []
        decoded_text = self.decoder(doc)
        tokens = self.tokenizer.convert_ids_to_tokens(
            self.tokenizer(decoded_text)['input_ids'],
            skip_special_tokens=True
        )

        last_idx = 0
        toks = []
        for raw_token in tokens:
            token_surface_string = raw_token
            if ord(raw_token[0]) == 288:  # 'Ġ' marks a leading space in RoBERTa's byte-level BPE
                token_surface_string = raw_token[1:]
            if ord(raw_token[0]) == 266:  # skip new lines
                last_idx += len(raw_token)
                continue
            token_idx = decoded_text.index(token_surface_string, last_idx)
            toks.append(Tok(_get_pos_tag(token_surface_string),
                            token_surface_string.lower(),
                            raw_token.lower(),
                            ent_type='' if self.entity_type is None else self.entity_type.get(token_surface_string, ''),
                            tag='' if self.tag_type is None else self.tag_type.get(token_surface_string, ''),
                            idx=token_idx))
            last_idx = token_idx + len(token_surface_string)
            if token_surface_string in ['.', '!', '?']:  # idiot's sentence splitter
                sents.append(toks)
                toks = []

        if len(toks) > 0:
            sents.append(toks)
        return Doc(sents, decoded_text)

    def get_subword_encoding_name(self):
        return 'RoBERTa'
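A hedged usage sketch for the new wrapper. It assumes the Hugging Face transformers and text_unidecode packages and the 'roberta-base' checkpoint name, none of which this diff pins:

from transformers import RobertaTokenizer

from scattertext.tokenizers.roberta import RobertaTokenizerWrapper

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
wrapper = RobertaTokenizerWrapper(tokenizer)
# Returns a WhitespaceNLP-style Doc whose Toks carry character offsets.
doc = wrapper.tokenize("Subword tokens align to character offsets. Neat!")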
4 changes: 2 additions & 2 deletions setup.py
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages

setup(name='scattertext',
- version='0.1.1',
+ version='0.1.2',
description='An NLP package to visualize interesting terms in text.',
url='https://github.com/JasonKessler/scattertext',
author='Jason Kessler',
@@ -17,7 +17,7 @@
'mock',
'statsmodels',
'flashtext',
- 'pytextrank==2.1.0'
+ #'pytextrank'
#'spacy',
#'jieba',
#'tinysegmenter',
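A note on the dropped pin, stated as an assumption rather than a fact from the diff:

# With pytextrank==2.1.0 commented out of install_requires, PyTextRank
# appears to become an optional dependency; installing a 3.x release
# enables the doc._.phrases path in PyTextRankPhrases.py, e.g.:
#   pip install "pytextrank>=3.0.0"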
2 changes: 1 addition & 1 deletion simple.py
@@ -4,7 +4,7 @@
from scattertext import produce_scattertext_html
from scattertext.CorpusFromPandas import CorpusFromPandas

- nlp = spacy.load('en')
+ nlp = spacy.load('en_core_web_sm')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
category_col='party',
