Skip to content
This repository has been archived by the owner on Oct 21, 2024. It is now read-only.

Doc: Add API Doc (#12) #23

Merged
merged 28 commits into from
Mar 5, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
2235f2d
docs: change folder name
41ow1ives Mar 5, 2024
a652e1d
refactor: edit precommit format
41ow1ives Mar 5, 2024
15584de
docs: modify docstring, add license in cleaning (#12)
41ow1ives Mar 5, 2024
0cb4b9a
docs: unify accent.py into char.py (#12)
41ow1ives Mar 5, 2024
eb30eb9
add ignore E203, E501, W503 to flake8
41ow1ives Mar 5, 2024
db2c0e6
docs: modify docstring, add license, typo in cleaning (#12)
41ow1ives Mar 5, 2024
968a20d
docs: modify docstring, add license, typo in cleaning (#12)
41ow1ives Mar 5, 2024
80e5e21
docs: modify docstring, add license, typo in data_ingestion (#12)
41ow1ives Mar 5, 2024
f97d6e2
docs: modify docstring, add license, typo in data_load (#12)
41ow1ives Mar 5, 2024
4bb08eb
docs: modify docstring, add license, typo in deduplication (#12)
41ow1ives Mar 5, 2024
c629157
docs: modify docstring, add license, typo in pii (#12)
41ow1ives Mar 5, 2024
0d72c76
docs: modify docstring, add license, typo in quality (#12)
41ow1ives Mar 5, 2024
9a5f1a0
docs: modify docstring, add license, typo in utils (#12)
41ow1ives Mar 5, 2024
c750aaa
docs: modify docstring, add license, typo in pipeline (#12)
41ow1ives Mar 5, 2024
5c2b749
docs: modify docstring, add license, typo in registry (#12)
41ow1ives Mar 5, 2024
16e1a22
add: add pass logic and etl_cls attributes (#12)
41ow1ives Mar 5, 2024
c90e29c
add: initial commit for sphinx (#12)
41ow1ives Mar 5, 2024
af2c3a7
add: add extensions, theme setting (#12)
41ow1ives Mar 5, 2024
4cf8b26
add: add setup to find docstring and pass non-ETL functions (#12)
41ow1ives Mar 5, 2024
e08e547
docs: notify not implemented (#12)
41ow1ives Mar 5, 2024
684da62
refactor: add @wraps for sphinx (#12)
41ow1ives Mar 5, 2024
904a0f8
docs: revise import(#12)
41ow1ives Mar 5, 2024
d17119a
add: add theme for sphinx (#12)
41ow1ives Mar 5, 2024
8c4b33e
add: add temp environment while building docs (#12)
41ow1ives Mar 5, 2024
2775e48
add: initial commit for documentation (#12)
41ow1ives Mar 5, 2024
7f9d669
docs: description about ETL
41ow1ives Mar 5, 2024
03e656c
docs: typo (#2)
41ow1ives Mar 5, 2024
0eb7ce3
docs: fix typo, add API Reference link (#2)
41ow1ives Mar 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
docs: unify accent.py into char.py (#12)
  • Loading branch information
41ow1ives committed Mar 5, 2024
commit 0cb4b9a345f426921c2503f7e92155519cdf9e96
54 changes: 0 additions & 54 deletions dataverse/etl/cleaning/accent.py

This file was deleted.

48 changes: 48 additions & 0 deletions dataverse/etl/cleaning/char.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"""

import re
import unicodedata
from typing import Union

from pyspark.rdd import RDD
Expand Down Expand Up @@ -82,3 +83,50 @@ def _remove_non_printable_char(row):
data = data.map(_remove_non_printable_char)

return data


def strip_accents(text: str) -> str:
    """Strip accents (nonspacing combining marks) from a piece of text.

    The text is decomposed with Unicode NFD normalization and every
    character whose Unicode category is ``Mn`` is dropped.

    Args:
        text (str): The text to process.

    Returns:
        str: ``text`` itself, untouched, when its decomposed form holds no
        ``Mn`` characters; otherwise the NFD-decomposed text with those
        characters removed.
    """
    nfd = unicodedata.normalize("NFD", text)
    stripped = "".join(c for c in nfd if unicodedata.category(c) != "Mn")
    # Compare against the decomposed form rather than the raw input: a
    # precomposed "é" is one code point before NFD and one ("e") after
    # stripping, so a length check against `text` would wrongly conclude
    # that nothing was removed.
    if len(stripped) == len(nfd):
        return text
    return stripped


@register_etl
def cleaning___char___remove_accent(
    spark, data: Union[RDD, DataFrame], subset: str = "text", *args, **kwargs
) -> RDD:
    """Remove accents from the ``subset`` field of every row.

    +--------+--------+
    | input  | output |
    +========+========+
    | café   | cafe   |
    +--------+--------+
    | résumé | resume |
    +--------+--------+

    Code is from facebookresearch/cc_net
    https://github.com/facebookresearch/cc_net/blob/main/cc_net/text_normalizer.py

    Args:
        spark (SparkSession): The Spark session object. Not used by this
            function's body.
        data (Union[RDD, DataFrame]): The input data to be processed. A
            DataFrame is converted to an RDD of dicts via ``Row.asDict()``;
            an RDD is used as given (presumably its rows already support
            item assignment -- confirm against callers).
        subset (str): A subset or column to consider. Defaults to 'text'.

    Returns:
        RDD: The rows, with ``row[subset]`` replaced by its accent-stripped
        value.

    """
    if isinstance(data, DataFrame):
        # Rows of a DataFrame are turned into plain dicts so the target
        # field can be reassigned below.
        rows = data.rdd.map(lambda record: record.asDict())
    else:
        rows = data

    def _without_accents(record):
        # Overwrites the field on the row object itself and hands it back.
        record[subset] = strip_accents(record[subset])
        return record

    return rows.map(_without_accents)