-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgender_id.py
27 lines (22 loc) · 1020 Bytes
/
gender_id.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import random
import nltk
from nltk.corpus import names
from nltk.corpus import movie_reviews
def gender_features(word):
    """Return the feature dict used by the classifier for a single name.

    The only feature extracted is the final character of ``word``, stored
    under the key ``'last_letter'`` with its original case.

    An empty ``word`` yields ``{'last_letter': ''}`` instead of raising
    ``IndexError``.
    """
    # Slicing (rather than indexing) keeps an empty string from raising.
    return {'last_letter': word[-1:]}
def gender_features2(name):
    """Return a richer feature dict for a single name.

    Features produced:

    * ``first_letter`` / ``last_letter`` -- the lowercased first and last
      character of ``name`` (empty strings when ``name`` is empty).
    * ``count(x)`` -- number of occurrences of letter ``x`` in the
      lowercased name, for every ``x`` in ``a``..``z``.
    * ``has(x)`` -- whether letter ``x`` occurs in the lowercased name.
    """
    # Lowercase once instead of twice per letter inside the loop.
    lowered = name.lower()

    features = {}
    # Slice before lowercasing: same result as name[0].lower() for non-empty
    # names, but an empty name yields '' instead of raising IndexError.
    features["first_letter"] = name[:1].lower()
    features["last_letter"] = name[-1:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = lowered.count(letter)
        features["has({})".format(letter)] = (letter in lowered)
    return features
# Load the names corpus and put it in ('text', 'tag') form:
# male names first, then female names.
male_names = [(name, 'male') for name in names.words('male.txt')]
female_names = [(name, 'female') for name in names.words('female.txt')]
labeled_names = male_names + female_names
# Shuffle in place so the split below does not depend on corpus order.
random.shuffle(labeled_names)

# Convert each (name, tag) pair to ({features}, 'tag') form.
featuresets = [
    (gender_features(n), gender)
    for (n, gender) in labeled_names
]

# The first 500 shuffled examples are held out for testing; the rest train.
test_set = featuresets[:500]
train_set = featuresets[500:]

classifier = nltk.NaiveBayesClassifier.train(train_set)
# Print the classifier's accuracy on the held-out examples.
print(nltk.classify.accuracy(classifier, test_set))