forked from AllenDowney/ThinkPython
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathanalyze_book3.py
62 lines (44 loc) · 1.53 KB
/
analyze_book3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""This module contains code from
Think Python by Allen B. Downey
http://thinkpython.com
Copyright 2012 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
import string
import random
from bisect import bisect
from analyze_book import *
def random_word(hist):
    """Chooses a random word from a histogram.

    The probability of each word is proportional to its frequency.
    Words whose frequency is zero are never chosen.

    This could be made faster by computing the cumulative frequencies
    once and reusing them.

    hist: map from word to frequency (frequencies are assumed to be
          non-negative integers)

    returns: one of the keys of hist

    raises: ValueError if hist is empty or its frequencies sum to zero
    """
    words = []
    freqs = []
    total_freq = 0

    # make a list of words and a list of cumulative frequencies
    for word, freq in hist.items():
        total_freq += freq
        words.append(word)
        freqs.append(total_freq)

    # Without this guard random.randint(0, -1) fails with an obscure
    # "empty range" error; raise the same exception type with a message
    # that names the real problem.
    if total_freq <= 0:
        raise ValueError('cannot choose a random word: histogram is empty '
                         'or has no positive frequencies')

    # choose a random value and find its location in the cumulative list;
    # bisect returns the first position whose cumulative frequency
    # exceeds x, so each word owns exactly `freq` values of x
    x = random.randint(0, total_freq-1)
    index = bisect(freqs, x)
    return words[index]
if __name__ == '__main__':
hist = process_file('emma.txt', skip_header=True)
print 'Total number of words:', total_words(hist)
print 'Number of different words:', different_words(hist)
t = most_common(hist)
print 'The most common words are:'
for freq, word in t[0:20]:
print word, '\t', freq
words = process_file('words.txt', skip_header=False)
diff = subtract(hist, words)
print "The words in the book that aren't in the word list are:"
for word in diff:
print word,
print "\n\nHere are some random words from the book"
for i in range(100):
print random_word(hist),