forked from wesm/pydata-book
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalysis.py
80 lines (61 loc) · 2.36 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from pandas import *
from pandas.util.decorators import cache_readonly
import numpy as np
import os
# Dataset directory name. The same string is the default ``base`` argument of
# Movielens.__init__; this module-level name is not read by the visible code.
base = 'ml-100k'
class IndexedFrame(object):
    """Pair a frame with the name of a field.

    This class is an unfinished stub: ``_build_index`` does nothing yet.
    NOTE(review): presumably the intent is to index ``frame`` by ``field``;
    confirm before building on it.

    Parameters
    ----------
    frame : object
        The data to wrap; stored unchanged as ``self.frame``.
    field : object
        Stored unchanged as ``self.field`` (it used to be discarded).
    """

    def __init__(self, frame, field):
        self.frame = frame
        # Keep the argument so a later _build_index implementation can use it.
        self.field = field

    def _build_index(self):
        """Placeholder; currently a no-op that returns None."""
        pass
class Movielens(object):
def __init__(self, base='ml-100k'):
self.base = base
@cache_readonly
def data(self):
names = ['user_id', 'item_id', 'rating', 'timestamp']
path = os.path.join(self.base, 'u.data')
return read_table(path, header=None, names=names)
@cache_readonly
def users(self):
names = ['user_id', 'age', 'gender', 'occupation', 'zip']
path = os.path.join(self.base, 'u.user')
return read_table(path, sep='|', header=None, names=names)
@cache_readonly
def items(self):
names = ['item_id', 'title', 'release_date', 'video_date',
'url', 'unknown', 'Action', 'Adventure', 'Animation',
"Children's", 'Comedy', 'Crime', 'Documentary', 'Drama',
'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
path = os.path.join(self.base, 'u.item')
return read_table(path, sep='|', header=None, names=names)
@cache_readonly
def genres(self):
names = ['name', 'id']
path = os.path.join(self.base, 'u.genre')
data = read_table(path, sep='|', header=None, names=names)[:-1]
return Series(data.name, data.id)
@cache_readonly
def joined(self):
merged = merge(self.data, self.users)
merged = merge(merged, self.items)
return merged
def movie_stats(self, title):
data = self.joined[self.joined.title == title]
return data.groupby('gender').rating.mean()
def biggest_gender_discrep(data):
nobs = data.pivot_table('rating', rows='title',
cols='gender', aggfunc=len, fill_value=0)
mask = (nobs.values > 10).all(1)
titles = nobs.index[mask]
mean_ratings = data.pivot_table('rating', rows='title',
cols='gender', aggfunc='mean')
mean_ratings = mean_ratings.ix[titles]
diff = mean_ratings.M - mean_ratings.F
return diff[np.abs(diff).argsort()[::-1]]
# NOTE(review): presumably age-group bin edges for the users' ``age`` column;
# not used anywhere in the visible code, so confirm before relying on it.
buckets = [0, 18, 25, 35, 50, 80]
# Module-level Movielens instance using the default base directory.
ml = Movielens()
# NOTE(review): presumably an example argument for ml.movie_stats(title).
title = 'Cable Guy, The (1996)'