Lossy Text Compression

To compress text in a lossy fashion, we simply pick the shortest alternative word from a thesaurus.

Alice in Wonderland compresses from 164K to 157K (and is still just about readable)!
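
To see what the lookup does for a single word, here is a minimal sketch (the helper name shortest_synonym and the example word "automobile" are illustrative only, and it skips the sense filtering that the full script below applies):

from nltk.corpus import wordnet as wn

def shortest_synonym(word):
    # Check every WordNet synset of the word and keep the shortest lemma name.
    best = word
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace('_', ' ')
            if len(candidate) < len(best):
                best = candidate
    return best

print(shortest_synonym("automobile"))  # typically prints a shorter synonym such as "car"

The full script below applies the same idea to every token of the input file, keeping only replacements drawn from synsets whose primary lemma is the word itself.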

#!/usr/bin/python3

import nltk
from nltk.corpus import wordnet as wn


def compressWord(word):
    """Return the shortest WordNet synonym for word, or word itself."""
    leng = len(word)
    sword = word

    for syn in wn.synsets(word):
        # Lemma names use underscores for spaces ("ice_cream" -> "ice cream").
        syns = [n.name().replace('_', ' ') for n in syn.lemmas()]

        # Only use synsets whose primary lemma is the word itself, so
        # replacements come from the word's own senses.
        if syns[0] != word:
            continue

        # Keep the shortest synonym found so far.
        for s in syns:
            if len(s) < leng:
                sword = s
                leng = len(sword)

    return sword


def compressFile(filename):
    with open(filename) as f:
        text = f.read()

    # Tokenize into abbreviations, numbers, words, whitespace and punctuation,
    # so the output can be reassembled with the original spacing intact.
    tokenizer = nltk.tokenize.RegexpTokenizer(
        r"(?:[A-Z][.])+|\d[\d,.:\-/\d]*\d|\w+[\w\-'.&|@:/]*\w+|\s|.|,|'|\"")
    words = tokenizer.tokenize(text)

    # compressWord() returns the token unchanged when no shorter synonym
    # exists, so the pieces can simply be concatenated back together.
    output = ""
    for w in words:
        output += compressWord(w)

    return output


print(compressFile("pg11.txt"))
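
Note: running this assumes the WordNet data is already installed and that pg11.txt (the Project Gutenberg copy of Alice's Adventures in Wonderland, ebook #11) sits in the working directory. A one-time setup could look like this:

import nltk
nltk.download('wordnet')   # corpus used by wn.synsets()
nltk.download('omw-1.4')   # may also be required for lemma data on newer NLTK releases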


