Skip to content

Commit

Permalink
update preprocess scripts
Browse files (browse the repository at this point in the history)
  • Loading branch information
vltanh committed Oct 10, 2020
1 parent 5d1f922 commit 515d952
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 6 deletions.
1 change: 1 addition & 0 deletions pre-processing.py
Original file line number | Diff line number | Diff line change
@@ -172,5 +172,6 @@ def preprocess_with_stopword(data):
cleaned_text = preprocess_with_stopword(tweets['Text'])

tweets['Cleaned_Text'] = cleaned_text
tweets = tweets[['ID', 'Label', 'Text', 'hashtag', 'Cleaned_Text']]
tweets.to_csv(f'{root_path}/cleaned_{filename}', index=False)
print('Complete pre-precessing data!'.upper())
14 changes: 8 additions & 6 deletions split_csv.py
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,8 @@
import argparse
import random

print('Make sure your CSV has the first column for ID and the second column for class label')

TRAIN = 'train_train'
VAL = 'train_val'

@@ -23,12 +25,12 @@

# Load CSV
df = pd.read_csv(args.csv)
data = df[['ID', 'Label', 'Text', 'hashtag', 'Cleaned_Text']].values
data = df.values

d = dict()
for id_str, lb, txt, hashtag, clean_txt in data:
for id_str, lb, *metadata in data:
d.setdefault(lb, [])
d[lb].append((id_str, txt, hashtag, clean_txt))
d[lb].append((id_str, *metadata))

splits = {
TRAIN: dict(),
@@ -43,11 +45,11 @@

# Split
for split, labels in splits.items():
out = [['ID', 'Label', 'Text', 'hashtag', 'Cleaned_Text']]
out = [list(df.keys())]
out.extend([
[id_str, lb, txt, hashtag, cleaned_txt]
[id_str, lb, *metadata]
for lb, values in labels.items()
for id_str, txt, hashtag, cleaned_txt in values
for id_str, *metadata in values
])
csv.writer(open(f'{args.out}/{split}.csv', 'w')).writerows(out)

0 comments on commit 515d952

Please sign in to comment.