-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathclassify_names.py
152 lines (111 loc) · 4.67 KB
/
classify_names.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import argparse
import os
import pandas as pd
from lstm_classifier import LSTMClassifier
# Dispatch table: map each supported file-format key to the pandas
# reader that loads it (keys are produced by check_data_type).
input_fns = dict(
    excel=pd.read_excel,
    csv=pd.read_csv,
    dta=pd.read_stata,
)
def check_data_type(fp):
    """Return the input-format key ("excel", "csv", or "dta") for *fp*.

    The returned key indexes into ``input_fns`` to pick the pandas
    reader for the file.

    Parameters
    ----------
    fp : str
        Path (or bare filename) of the input data file.

    Raises
    ------
    ValueError
        If the file extension is not a supported format.
    """
    # Classify on the actual file extension rather than a substring test,
    # so paths like "csv_backup.xlsx" or "names.dta.zip" are not
    # misclassified by an extension appearing mid-name.
    suffix = os.path.splitext(fp)[1].lower()
    if suffix in (".xlsx", ".xls"):
        return "excel"
    if suffix == ".csv":
        return "csv"
    if suffix == ".dta":
        return "dta"
    raise ValueError("Input data must be excel, csv, or dta format")
def classify_gender(mode: str, model_fp: str = "", data_fn: str = "", output_fp: str = ""):
    """
    Wrapper function to call the LSTMClassifier object to either train
    a model or use an existing model to classify data.

    mode = train, classify
    model_fp = Full path to where model is stored (specify in classify mode)
    data_fn = File name, including full path, for training data (in train mode)
              or unlabelled data (in classify mode) ("railway_names_clean_full.dta")
    output_fp = filepath where data is written out to

    In "train" mode: pickles the train/test split to
    <output_fp>/training_data.pkl and returns the filepath of the
    trained model (as reported by LSTMClassifier.train_model).
    In "classify" mode: writes predictions to
    "names_female_class_sample.csv" (in output_fp, or the current
    directory when output_fp is empty) and returns None.
    Raises FileNotFoundError in classify mode if model_fp does not exist.
    Any other mode value is silently a no-op.
    """
    # expand the home directory symbol (~) in any filepaths
    model_fp = os.path.expanduser(model_fp)
    data_fn = os.path.expanduser(data_fn)
    output_fp = os.path.expanduser(output_fp)
    # --------------- #
    # TRAIN THE MODEL #
    # --------------- #
    if mode == "train":
        # initiate a fresh (untrained) name classifier object
        model = LSTMClassifier()
        # find the input function based on the file type
        read_in = input_fns[check_data_type(data_fn)]
        # read in the data
        df = read_in(data_fn)
        # keep only rows labelled exactly "M" or "F"; any other value
        # (including missing) is dropped from the training data
        df = df[df['gender'].isin(["M", "F"])]
        # 1) CLEAN the raw name strings (adds a *_clean column)
        df = model.clean_string(df, stringvar="full_name")
        # 2) SPLIT into train/test sets on the cleaned name column
        X_train_df, y_train, X_test_df, y_test = model.train_test_split(df, "full_name_clean", "gender")
        # 3) ENCODE names into model inputs
        # NOTE(review): top_chars from the train call is overwritten by the
        # test call below — presumably encode_x yields the same character
        # vocabulary both times; confirm against LSTMClassifier.encode_x
        X_train, top_chars = model.encode_x(X_train_df, "full_name_clean")
        X_test, top_chars = model.encode_x(X_test_df, "full_name_clean")
        # store all data in a dictionary so the exact split can be reused
        training_data = {"X_train_df": X_train_df,
                         "X_train": X_train,
                         "X_test_df": X_test_df,
                         "X_test": X_test,
                         "y_train": y_train,
                         "y_test": y_test}
        # save testing and training datasets
        pd.to_pickle(training_data, os.path.join(output_fp, "training_data.pkl"))
        # 4) TRAIN the model; train_model returns the saved-model filepath
        model_fp = model.train_model(X_train, y_train, X_test, y_test)
        return model_fp
    # ----------------------- #
    # CLASSIFY SAMPLE NAMES   #
    # ----------------------- #
    if mode == "classify":
        # check if the model exists before attempting to load it
        if not os.path.exists(model_fp):
            raise FileNotFoundError(f"{model_fp} file cannot be found")
        # load the previously trained model from disk
        model = LSTMClassifier(model_fp=model_fp, load=True, epochs=5)
        print(f"Classifying names...")
        # create dictionary to hold all of the processed dataframe chunks
        dfs = {}
        num = 0
        # find the input function based on the file type
        read_in = input_fns[check_data_type(data_fn)]
        # read in the data in chunks as these are large files
        for df_chunk in read_in(data_fn, chunksize=1000000):
            # replace any missing names with empty strings
            df_chunk["name"] = df_chunk["name"].fillna("")
            # 1) CLEAN the raw names (adds "name_clean")
            df_chunk = model.clean_string(df_chunk, "name")
            # 2) ENCODE the cleaned names into model inputs
            X_data, top_chars = model.encode_x(df_chunk, "name_clean")
            # 3) CLASSIFY the encoded names
            y_pred = model.predict_classes(X_data)
            # first output column is taken as the female score, second as
            # male (the original comment said muslim/non-muslim — likely
            # copied from a related classifier; verify the column order
            # against LSTMClassifier.predict_classes)
            df_chunk['female'] = y_pred[:, 0]
            df_chunk['male'] = y_pred[:, 1]
            # store the dataframe chunk in the dictionary
            dfs[num] = df_chunk
            # free the chunk-local objects to keep peak memory down
            del df_chunk, X_data
            # iterate num, which is the key to storing new dataframes in dfs
            num += 1
        # append all the dataframe chunks in the dictionary into one dataframe
        df = pd.concat(dfs.values(), ignore_index=True)
        # build the output path; empty output_fp writes to the current dir
        if not output_fp:
            output_fn = "names_female_class_sample.csv"
        else:
            output_fn = os.path.join(output_fp, "names_female_class_sample.csv")
        # write out the results
        df.to_csv(output_fn, index=False)
        print("Classification complete.")
        del dfs