To compress text in a lossy fashion, we simply replace each word with the
shortest alternative word from a thesaurus.
Alice in Wonderland compresses from 164K to 157K (and is still just about readable)!
#!/usr/bin/python3
"""Lossy text "compression": swap each word for its shortest WordNet synonym.

The input text is split into tokens, every token is looked up in WordNet,
and the shortest synonym found (if any is strictly shorter than the token)
is written out in its place.
"""
import functools
import sys
import textwrap

import nltk
from nltk.corpus import wordnet as wn

# Tokenizer pattern, tried left to right at each position:
#   1. runs of "capital letter + period" pairs, e.g. "U.S."
#   2. digit sequences with inner separators, starting and ending on a digit
#   3. words of two or more word characters, allowing inner punctuation
#   4. a single whitespace character, or any other single character
#      (the trailing `,`, `'` and `"` alternatives are already covered by
#      `.`; they are kept so the pattern itself stays unchanged).
# Every character of the input is matched by some alternative, so joining
# the tokens reproduces the original text.
_TOKEN_PATTERN = (
    r"(?:[A-Z][.])+"
    r"|\d[\d,.:\-/\d]*\d"
    r"|\w+[\w\-'.&|@:/]*\w+"
    r"|\s|.|,|'"
    r'|"'
)

# File compressed when no path is given on the command line.
_DEFAULT_INPUT = "pg11.txt"


@functools.lru_cache(maxsize=65536)
def compressWord(word):
    """Return the shortest WordNet synonym of *word*, or *word* itself.

    Only synsets whose first lemma equals *word* exactly (case-sensitive)
    are considered. Underscores in multi-word lemma names are replaced by
    spaces before lengths are compared. Ties keep the earliest candidate,
    and *word* wins a tie against any synonym of equal length.

    Results are memoized because the same tokens recur many times in a
    text and each WordNet lookup is comparatively expensive.

    Args:
        word: A single token.

    Returns:
        The replacement string; never ``None``.
    """
    # A replacement must be strictly shorter than the token, so tokens of
    # length 0 or 1 (whitespace, punctuation, "a", "I") can never shrink.
    # Skipping them avoids a WordNet lookup for the bulk of the tokens.
    if len(word) < 2:
        return word

    shortest = word
    for synset in wn.synsets(word):
        candidates = [
            lemma.name().replace('_', ' ') for lemma in synset.lemmas()
        ]
        # Skip senses where the word is not the primary lemma.
        if not candidates or candidates[0] != word:
            continue
        # min() returns the first minimal element, so the current best
        # wins ties, matching a strict "<" comparison.
        shortest = min([shortest, *candidates], key=len)
    return shortest


def compressFile(filename):
    """Read *filename* and return its text with every token compressed.

    The file is opened in text mode with the platform default encoding.

    Args:
        filename: Path of the text file to compress.

    Returns:
        The whole rewritten text as one string. Whitespace and punctuation
        are tokens too, so the layout of the input is kept.
    """
    with open(filename) as source:
        text = source.read()

    tokenizer = nltk.tokenize.RegexpTokenizer(_TOKEN_PATTERN, gaps=False)
    return "".join(compressWord(token) for token in tokenizer.tokenize(text))


def main(argv=None):
    """Compress the file named by the first argument and print the result.

    Falls back to ``pg11.txt`` when no argument is supplied.
    """
    args = sys.argv[1:] if argv is None else argv
    filename = args[0] if args else _DEFAULT_INPUT
    print(compressFile(filename))


if __name__ == "__main__":
    main()
Leave Comment