Update preprocessing scripts

HongHanh2104 · Oct 10, 2020 · 515d952
1 parent 5d1f922
commit 515d952
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 6 deletions.
diff --git a/pre-processing.py b/pre-processing.py
@@ -172,5 +172,6 @@ def preprocess_with_stopword(data):
 cleaned_text = preprocess_with_stopword(tweets['Text'])
 
 tweets['Cleaned_Text'] = cleaned_text
+tweets = tweets[['ID', 'Label', 'Text', 'hashtag', 'Cleaned_Text']]
 tweets.to_csv(f'{root_path}/cleaned_{filename}', index=False)
 print('Complete pre-precessing data!'.upper())
diff --git a/split_csv.py b/split_csv.py
@@ -3,6 +3,8 @@
 import argparse
 import random
 
+print('Make sure your CSV has the first column for ID and the second column for class label')
+
 TRAIN = 'train_train'
 VAL = 'train_val'
 
@@ -23,12 +25,12 @@
 
 # Load CSV
 df = pd.read_csv(args.csv)
-data = df[['ID', 'Label', 'Text', 'hashtag', 'Cleaned_Text']].values
+data = df.values
 
 d = dict()
-for id_str, lb, txt, hashtag, clean_txt in data:
+for id_str, lb, *metadata in data:
     d.setdefault(lb, [])
-    d[lb].append((id_str, txt, hashtag, clean_txt))
+    d[lb].append((id_str, *metadata))
 
 splits = {
     TRAIN: dict(),
@@ -43,11 +45,11 @@
 
 # Split
 for split, labels in splits.items():
-    out = [['ID', 'Label', 'Text', 'hashtag', 'Cleaned_Text']]
+    out = [list(df.keys())]
     out.extend([
-        [id_str, lb, txt, hashtag, cleaned_txt]
+        [id_str, lb, *metadata]
         for lb, values in labels.items()
-        for id_str, txt, hashtag, cleaned_txt in values
+        for id_str, *metadata in values
     ])
     csv.writer(open(f'{args.out}/{split}.csv', 'w')).writerows(out)