-
Notifications
You must be signed in to change notification settings - Fork 32
/
classifiation.py
76 lines (58 loc) · 2.21 KB
/
classifiation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas as pd
from feature_extraction import feature_extraction
# Build a reverse lookup table so that each related feature maps to its corresponding main feature
def construct_rev_lookup(features):
    """Build a reverse lookup from every feature word to its main feature.

    Args:
        features: Mapping whose keys are main feature names and whose values
            are iterables of related feature words.

    Returns:
        A dict mapping each main feature name to itself and each related
        word to the main feature it was first listed under.
    """
    reverse_map = {}
    for main_feature, aliases in features.items():
        # A main feature always resolves to itself, even when an earlier
        # bucket already listed it as one of its related words.
        reverse_map[main_feature] = main_feature
        for alias in aliases:
            # The first bucket that claims a word keeps it.
            reverse_map.setdefault(alias, main_feature)
    return reverse_map
def _has_two_content_tokens(sent, ignored_pos):
    """Return True when *sent* has at least two alphabetic tokens whose POS tag is not in *ignored_pos*."""
    content_tokens = 0
    for token in sent:
        if token.is_alpha and token.pos_ not in ignored_pos:
            content_tokens += 1
            # Two content words are enough; no need to scan the rest.
            if content_tokens > 1:
                return True
    return False


def _find_category(sent, lookup):
    """Return ``(category, is_ambiguous)`` for the feature words found in *sent*.

    ``category`` is the first main feature matched (or ``None`` when no token
    text is a key of *lookup*); ``is_ambiguous`` is True as soon as a later
    token maps to a different main feature.
    """
    category = None
    for token in sent:
        if token.text not in lookup:
            continue
        bucket = lookup[token.text]
        if category is None:
            category = bucket
        elif bucket != category:
            return category, True
    return category, False


def classify(df, features, model):
    """Assign each usable sentence to a single feature and predict its sentiment.

    Every sentence of every item in ``df['spacyObj']`` is examined:

    * Sentences with fewer than two alphabetic tokens outside the
      ``PRON``/``AUX``/``DET`` part-of-speech tags are skipped entirely.
    * Sentences whose tokens map to more than one main feature are collected
      separately and not scored.
    * Sentences with no token matching any feature are collected separately
      and not scored.
    * The remaining sentences are passed, one at a time, to ``model.predict``.

    Args:
        df: Object indexable by ``'spacyObj'`` yielding an iterable of items
            that expose ``.sents``; each sentence is iterable over tokens with
            ``.text``, ``.is_alpha`` and ``.pos_`` and has a ``.text`` itself
            (presumably spaCy ``Doc`` objects -- confirm against the caller).
        features: Mapping of main feature name to related feature words, as
            accepted by ``construct_rev_lookup``.
        model: Object with ``predict(list_of_str)`` returning an indexable
            result; the original comments describe it as a logistic
            regression model loaded when the server starts.

    Returns:
        A tuple ``(results_df, more_than_one_df, no_cat_df)`` of pandas
        DataFrames: scored sentences (columns ``category``, ``sentence``,
        ``sentiment``), sentences matching several features (column
        ``sentences``), and sentences matching none (column ``sentence``).
    """
    lookup = construct_rev_lookup(features)
    ignored_pos = {'PRON', 'AUX', 'DET'}

    # Result columns are kept in dedicated locals so the ``features``
    # argument is not rebound part-way through the function.
    categories = []
    sentences = []
    sentiments = []
    no_cat_sents = []
    more_than_one_sents = []

    for review in df['spacyObj']:
        for sent in review.sents:
            # Sentences made up only of pronouns, auxiliaries or articles
            # (or with a single content word) are not considered at all.
            if not _has_two_content_tokens(sent, ignored_pos):
                continue

            category, is_ambiguous = _find_category(sent, lookup)
            text = sent.text

            if is_ambiguous:
                more_than_one_sents.append(text)
                continue
            if category is None:
                no_cat_sents.append(text)
                continue

            # Exactly one feature is mentioned: score this sentence alone.
            prediction = model.predict([text])[0]
            categories.append(category)
            sentences.append(text)
            sentiments.append(prediction)

    results_df = pd.DataFrame({'category': categories, 'sentence': sentences, 'sentiment': sentiments})
    no_cat_df = pd.DataFrame({'sentence': no_cat_sents})
    # NOTE(review): this column is 'sentences' (plural) unlike the other two
    # frames; kept unchanged because callers may rely on the key.
    more_than_one_df = pd.DataFrame({'sentences': more_than_one_sents})
    return results_df, more_than_one_df, no_cat_df