forked from csev/py4e
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gword.py
55 lines (48 loc) · 1.42 KB
/
gword.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import sqlite3
import time
import urllib
import zlib
import string
# Read every message subject from the index database and count how often
# each word of four or more characters appears.
conn = sqlite3.connect('index.sqlite')
conn.text_factory = str
cur = conn.cursor()

cur.execute('''SELECT subject_id,subject FROM Messages
JOIN Subjects ON Messages.subject_id = Subjects.id''')

# Characters deleted from a subject before it is split into words.
strip_chars = frozenset(string.punctuation + '1234567890')

counts = dict()
for message_row in cur:
    text = message_row[1]
    # str.translate(None, chars) only exists for Python 2 byte strings and
    # raises TypeError on Python 3; filtering characters works on both and
    # removes punctuation and digits in a single pass.
    text = ''.join(ch for ch in text if ch not in strip_chars)
    text = text.strip().lower()
    for word in text.split():
        # Ignore short words (fewer than four characters).
        if len(word) < 4:
            continue
        counts[word] = counts.get(word, 0) + 1

# All rows have been consumed; release the database handle.
conn.close()
# Find the top 100 words: order every counted word by descending frequency.
words = sorted(counts, key=counts.get, reverse=True)

# Largest and smallest count among the (at most) 100 most frequent words.
# Both stay None when no words were counted at all.
top_counts = [counts[w] for w in words[:100]]
highest = max(top_counts) if top_counts else None
lowest = min(top_counts) if top_counts else None

# A single pre-formatted string prints the same text on Python 2 and 3.
print('Range of counts: %s %s' % (highest, lowest))
# Spread the font sizes across 20-100 based on the count.
# NOTE: bigsize is the span added on top of smallsize, so the most frequent
# word gets smallsize + bigsize = 100 and the least frequent gets 20.
bigsize = 80
smallsize = 20

top_words = words[:100]

# Width of the count range being scaled. It is 0 when there are no words or
# when every top word has the same count; dividing by it in that case would
# raise ZeroDivisionError, so the loop below falls back to a fixed fraction.
spread = (highest - lowest) if top_words else 0

entries = []
for k in top_words:
    if spread:
        fraction = (counts[k] - lowest) / float(spread)
    else:
        # No range to scale over: give every word the same mid-range size.
        fraction = 0.5
    size = int((fraction * bigsize) + smallsize)
    entries.append("{text: '" + k + "', size: " + str(size) + "}")

# Write the JavaScript array consumed by the word-cloud page; the context
# manager closes the file even if a write fails.
with open('gword.js', 'w') as fhand:
    fhand.write("gword = [")
    fhand.write(",\n".join(entries))
    fhand.write("\n];\n")

print("Output written to gword.js")
print("Open gword.htm in a browser to view")