Skip to content

Commit

Permalink
Add HTML tag removal to the default preprocessing pipeline (jbesomi#192)
Browse files Browse the repository at this point in the history
* Add HTML tag removal to the default preprocessing pipeline

* Update docstring and add more tests to default cleaning pipeline

* Update `get_default_pipeline` docstring
  • Loading branch information
hugoabonizio authored Oct 21, 2020
1 parent a1c03b3 commit c408c67
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 10 deletions.
21 changes: 19 additions & 2 deletions tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,23 @@ def test_pipeline_stopwords(self):
pipeline = [preprocessing.lowercase, preprocessing.remove_stopwords]
self.assertEqual(preprocessing.clean(s, pipeline=pipeline), s_true)

def test_pipeline_default(self):
    # Exercise `preprocessing.clean` without an explicit pipeline, i.e. with
    # the default one. The inputs cover spaced HTML-like tags ("< br />"),
    # a missing value, and a mix of digits, punctuation and diacritics.
    raw_texts = [
        "Amazon! < br />< br /> If I was going to order any soft drink online, it would be Diet Coke with Lime",
        pd.NA,
        "-1234. Mère, Françoise, noël",
    ]
    # Expected output: lowercased, tags/digits/punctuation/stopwords removed,
    # accents stripped, and the missing value turned into an empty string.
    expected_texts = [
        "amazon going order soft drink online would diet coke lime",
        "",
        "mere francoise noel",
    ]
    cleaned = preprocessing.clean(pd.Series(raw_texts))
    self.assertEqual(cleaned, pd.Series(expected_texts))

"""
Test stopwords.
"""
Expand Down Expand Up @@ -151,8 +168,8 @@ def test_stopwords_are_set(self):
"""

def test_remove_html_tags(self):
# Verifies that `preprocessing.remove_html_tags` drops HTML tags and the
# "&nbsp;" entity from the input Series.
# NOTE(review): this is a rendered diff without +/- markers; the first
# `s` / `s_true` pair below appears to be the pre-change version of the
# test and is overwritten by the second pair — confirm against the repo.
s = pd.Series("<html>remove <br>html</br> tags<html> &nbsp;")
s_true = pd.Series("remove html tags ")
# Post-change input additionally contains spaced tags ("< br />"), which
# are expected to be removed as well (expected output is unchanged).
s = pd.Series("<html>remove <br>html</br> tags<html> &nbsp; < br />< br />")
s_true = pd.Series("remove html tags ")
self.assertEqual(preprocessing.remove_html_tags(s), s_true)

"""
Expand Down
19 changes: 11 additions & 8 deletions texthero/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,15 +389,17 @@ def get_default_pipeline() -> List[Callable[[pd.Series], pd.Series]]:
1. :meth:`texthero.preprocessing.fillna`
2. :meth:`texthero.preprocessing.lowercase`
3. :meth:`texthero.preprocessing.remove_digits`
4. :meth:`texthero.preprocessing.remove_punctuation`
5. :meth:`texthero.preprocessing.remove_diacritics`
6. :meth:`texthero.preprocessing.remove_stopwords`
7. :meth:`texthero.preprocessing.remove_whitespace`
4. :meth:`texthero.preprocessing.remove_html_tags`
5. :meth:`texthero.preprocessing.remove_punctuation`
6. :meth:`texthero.preprocessing.remove_diacritics`
7. :meth:`texthero.preprocessing.remove_stopwords`
8. :meth:`texthero.preprocessing.remove_whitespace`
"""
return [
fillna,
lowercase,
remove_digits,
remove_html_tags,
remove_punctuation,
remove_diacritics,
remove_stopwords,
Expand All @@ -415,10 +417,11 @@ def clean(s: TextSeries, pipeline=None) -> TextSeries:
1. :meth:`texthero.preprocessing.fillna`
2. :meth:`texthero.preprocessing.lowercase`
3. :meth:`texthero.preprocessing.remove_digits`
4. :meth:`texthero.preprocessing.remove_punctuation`
5. :meth:`texthero.preprocessing.remove_diacritics`
6. :meth:`texthero.preprocessing.remove_stopwords`
7. :meth:`texthero.preprocessing.remove_whitespace`
4. :meth:`texthero.preprocessing.remove_html_tags`
5. :meth:`texthero.preprocessing.remove_punctuation`
6. :meth:`texthero.preprocessing.remove_diacritics`
7. :meth:`texthero.preprocessing.remove_stopwords`
8. :meth:`texthero.preprocessing.remove_whitespace`
Parameters
----------
Expand Down

0 comments on commit c408c67

Please sign in to comment.