To compress text in a lossy fashion, we simply replace each word with the
shortest alternative word from a thesaurus.
Alice in Wonderland compresses from 164K to 157K (and is still just about readable)!
#!/usr/bin/python3
"""Lossy text "compression": swap words for shorter WordNet synonyms."""
import functools
import sys
import textwrap

import nltk
from nltk.corpus import wordnet as wn

# Token grammar handed to nltk's RegexpTokenizer (in matching mode, not gap
# mode).  Every character is matched by some alternative (whitespace by
# ``\s``, anything else by ``.``), so concatenating the tokens reproduces
# the input text exactly.
_TOKEN_PATTERN = (
    r"(?:[A-Z][.])+"            # runs of capital-letter-plus-dot, e.g. "U.S."
    r"|\d[\d,.:\-/\d]*\d"       # digit runs with , . : - / inside
    r"|\w+[\w\-'.&|@:/]*\w+"    # words of 2+ chars, internal punctuation allowed
    r"|\s|.|,|'|\""             # single whitespace / any other single character
)


@functools.lru_cache(maxsize=65536)
def compressWord(word):
    """Return the shortest WordNet synonym of *word*, or *word* itself.

    Only synsets whose first lemma is exactly *word* (case-sensitive) are
    considered; underscores in lemma names are shown as spaces.  A synonym
    replaces the current candidate only when it is strictly shorter, so on
    ties the earliest one found wins.  The result is always a string: when
    nothing shorter exists the input is returned unchanged.

    Results are memoised because the same tokens (spaces, common words)
    recur constantly in running text.
    """
    shortest = word
    for synset in wn.synsets(word):
        candidates = [lemma.name().replace("_", " ") for lemma in synset.lemmas()]
        # Skip senses where *word* is not the primary lemma.
        if candidates[0] != word:
            continue
        for candidate in candidates:
            if len(candidate) < len(shortest):
                shortest = candidate
    return shortest


def compressFile(filename):
    """Read *filename* and return its text with each token "compressed".

    The text is split with ``_TOKEN_PATTERN`` and every token is passed
    through ``compressWord``; the results are concatenated in order.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding, and close the file deterministically.
    with open(filename, encoding="utf-8") as handle:
        text = handle.read()
    tokenizer = nltk.tokenize.RegexpTokenizer(_TOKEN_PATTERN, gaps=False)
    return "".join(compressWord(token) for token in tokenizer.tokenize(text))


if __name__ == "__main__":
    # Default to the original hard-coded input; allow an optional path argument.
    path = sys.argv[1] if len(sys.argv) > 1 else "pg11.txt"
    print(compressFile(path))
Leave Comment
Error