Skip to content

Commit 7522657

Browse files
author
aashumallik
committed
A lossless file compression engine built in Python
1 parent f6575c2 commit 7522657

File tree

1 file changed

+83
-0
lines changed

1 file changed

+83
-0
lines changed
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import huffman
2+
import bitstring
3+
from bitstring import BitArray
4+
5+
# Huffman round-trip demo.
#
# Reads foo.txt, builds a Huffman codebook from its character frequencies,
# writes the packed bit stream to compressed_foo.bin, then reads that file
# back and decodes it into foonew.txt.  The codebook and the number of valid
# bits are kept in memory only; compressed_foo.bin holds just the raw
# (zero-padded) bit stream.

# Display order for the frequency report.  Characters outside this set are
# still counted and compressed; they are listed after these in the report.
mycharset = (
    "\n"
    " abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789!#$%&'()\"*+,-./:;<=>?@[\\]^_`{|}~"
    "àæçèéêôëü"
)


def count_symbols(text):
    """Return a dict mapping every character of *text* to its occurrence count."""
    counts = {}
    for char in text:
        counts[char] = counts.get(char, 0) + 1
    return counts


def report_order(counts, preferred=mycharset):
    """Return the counted characters in a stable order.

    Characters found in *preferred* come first, in *preferred* order; any
    remaining characters follow in sorted order.
    """
    known = [char for char in dict.fromkeys(preferred) if char in counts]
    listed = set(known)
    return known + sorted(char for char in counts if char not in listed)


def build_codebook(weights):
    """Return a dict mapping each symbol in *weights* to its code bit string.

    *weights* maps symbol -> weight (here: appearance probability).
    """
    if not weights:
        return {}
    if len(weights) == 1:
        # With a single distinct symbol there is nothing to branch on; use an
        # explicit one-bit code so every occurrence still emits a bit and the
        # decoder can advance.
        return {symbol: "0" for symbol in weights}
    return huffman.codebook(weights.items())


def encode(text, codebook):
    """Concatenate the code of every character of *text* into one bit string."""
    return "".join(codebook[char] for char in text)


def bytes_to_bits(data):
    """Expand a bytes object into a string of '0'/'1', eight bits per byte."""
    return "".join(format(byte, "08b") for byte in data)


def decode(bits, codebook):
    """Translate the bit string *bits* back into text.

    *codebook* maps symbol -> code and is expected to be prefix-free, so at
    most one code can match at any position.

    Raises:
        ValueError: if some run of bits matches no code, or the stream ends
            in the middle of a code.
    """
    symbol_for = {code: symbol for symbol, code in codebook.items()}
    longest = max(map(len, symbol_for), default=0)
    pieces = []
    pending = ""
    for bit in bits:
        pending += bit
        if pending in symbol_for:
            pieces.append(symbol_for[pending])
            pending = ""
        elif len(pending) >= longest:
            # No code is longer than this, so these bits can never match.
            raise ValueError("bit sequence %r matches no code" % pending)
    if pending:
        raise ValueError("bit stream ends in the middle of a code")
    return "".join(pieces)


def main():
    """Compress foo.txt to compressed_foo.bin and restore it as foonew.txt."""
    with open("foo.txt", "r") as myfile:
        allofthefile = myfile.read()

    counts = count_symbols(allofthefile)
    totalcount = len(allofthefile)
    order = report_order(counts)
    # Empty input yields an empty order, so no division by zero happens here.
    probabilities = {char: counts[char] / totalcount for char in order}
    mycodebook = build_codebook(probabilities)

    for char in order:
        print(char, " has ", "{0:04d}".format(counts[char]),
              " times appeared. Probability = ",
              "{:.10f}".format(probabilities[char]) + " Huffman: " + mycodebook[char])

    onesandzeros = encode(allofthefile, mycodebook)

    with open("compressed_foo.bin", "wb") as binary_file:
        if onesandzeros:
            # tofile() pads the final partial byte with zero bits.
            BitArray(bin=onesandzeros).tofile(binary_file)

    with open("compressed_foo.bin", "rb") as binary_file:
        allofthebinaryfile = binary_file.read()

    # Only the first len(onesandzeros) bits are data; the rest is padding that
    # would otherwise decode into spurious trailing characters.
    newonesandzeros = bytes_to_bits(allofthebinaryfile)[:len(onesandzeros)]
    mynewfile = decode(newonesandzeros, mycodebook)

    with open("foonew.txt", "w") as newfile:
        newfile.write(mynewfile)


if __name__ == "__main__":
    main()
83+

0 commit comments

Comments
 (0)