# Zipf1.py -- forked from AllenDowney/ThinkComplexity
# (78 lines, 62 loc, 2.24 KB)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import sys
import string
from math import *
class Hist(dict):
    """A histogram: a dictionary that maps from each item (x) to the
    number of times the item has appeared (frequency, f).
    """

    def __init__(self, seq=()):
        """Create a new histogram starting with the items in seq.

        seq: any iterable of hashable items; each item is counted once
        per occurrence.  The default is an empty (immutable) tuple, so
        no mutable object is shared between calls.
        """
        dict.__init__(self)
        for x in seq:
            self.count(x)

    def count(self, x):
        """Increment the counter associated with item x."""
        self[x] = self.get(x, 0) + 1

    def pdf(self):
        """Return a list of tuples where each tuple is a value x
        and a frequency f, sorted by value.

        The values must be mutually sortable.  An empty histogram
        yields an empty list.
        """
        return sorted(self.items())

    def cdf(self):
        """Return a list of tuples where each tuple is a value x
        and the cumulative fraction of values less than or equal
        to x.  This is the empirical CDF of the values in the Hist.

        Note: the cdf makes more sense if the data values in the Hist
        are numeric, but this function works for any data type that
        can be sorted.  An empty histogram yields an empty list.
        """
        # float() forces true division even under Python 2, where
        # int / int would truncate every fraction below 1 to 0.
        total = float(sum(self.values()))
        result = []
        running = 0
        for x, f in self.pdf():
            running += f
            result.append((x, running / total))
        return result

    def rank_freq(self):
        """Return a list of tuples where each tuple is a rank
        and the number of times the item with that rank appeared.

        Ranks start at 1 for the most frequent item; items with equal
        frequency get consecutive ranks.
        """
        freqs = sorted(self.values(), reverse=True)
        return list(enumerate(freqs, 1))
class Zipf(Hist):
    """Zipf is a histogram that maps from words to frequencies.

    It provides methods to print data for a Zipf plot (frequency
    versus rank) and a complementary CDF (percentile versus value),
    both on log-log axes.
    """

    def process_file(self, filename):
        """Count every word in the text file named filename.

        On each line, double hyphens and the sequence "'s " are
        replaced by a space; the line is then split on whitespace and
        each piece is passed to process_word.
        """
        # The with-block closes the file even if counting raises.
        with open(filename, 'r') as fp:
            for line in fp:
                line = line.replace('--', ' ')
                line = line.replace("'s ", ' ')
                for word in line.rstrip().split():
                    self.process_word(word)

    def process_word(self, word):
        """Strip leading/trailing punctuation from word and count it.

        Tokens made up entirely of punctuation strip down to the empty
        string; they are skipped so that '' is never counted as a word.
        """
        word = word.strip(string.punctuation)
        if word:
            self.count(word)

    def print_ranks(self):
        """Print the data for a bar chart in which the x-axis
        shows ranks in increasing order and the y-axis shows the
        frequency of the value with the given rank.

        Each output line is "log10(rank) log10(frequency)".
        """
        for r, f in self.rank_freq():
            # A single parenthesized argument parses both as a
            # Python 2 print statement and a Python 3 print() call.
            print('%s %s' % (log10(r), log10(f)))
def main(name, filename='', flag=None, *args):
    """Print log-log rank/frequency data for the words in a text file.

    name: the script name (sys.argv[0]); used only in the usage message.
    filename: path of the text file to analyze.
    flag, args: accepted so that extra command-line arguments do not
    break the call; they are not used.

    Exits with a usage message when no filename is given, rather than
    failing later while trying to open an empty path.
    """
    if not filename:
        sys.exit('usage: %s filename' % name)

    z = Zipf()
    z.process_file(filename)
    z.print_ranks()


if __name__ == '__main__':
    # sys.argv[0] lands in `name`; the remaining arguments fill
    # filename, flag and args in order.
    main(*sys.argv)