#
# SpellMunger
# Simple spell checker
# by Nick Rusnov <nick@rusnov.net> 2008
#

import random
import string
import re
import sys

RESULT_NOMATCH = 0
RESULT_GOOD = 1
RESULT_MATCHES = 2

_vowels = ['a','e','i','o','u']

# this is rather inefficient
def singlifyFilter(wordlist):
    """Return the set passed with every possible variation of doubled letters removed (recursively, so looop => loop, lop)"""
    for w in wordlist:
        prevl = None
        idx = 0
        doubles = list()
        possibles = set()
        for l in w:
            if prevl == l:
                doubles.append(idx)
            prevl = l
            idx += 1
        for d in doubles:
            possibles.add(w[0:d] + w[d+1:])
        
        if len(possibles):
            wordlist = wordlist.union(possibles)
            wordlist = wordlist.union(singlifyFilter(possibles))
    return wordlist

def vowelPermeate(word, idx):
    """Return a set of one particular index of the word replaced with all of the other values."""
    output = set()
    vlist = list(_vowels)
    del vlist[vlist.index(word[idx])]
    
    for v in vlist:
        output.add(word[0:idx] + v + word[idx+1:])
    
    return output
                     

def vowelifyFilter(wordlist):
    """Return a set with every possible variation of vowels replaced (eg loop => l[aeiou][aeiou]p) from the input set."""
    output = set()
    
    for word in wordlist:
        indexes = list()
        idx = 0
        # collect all of the vowel indexes
        for letter in word:
            if letter in _vowels:
                indexes.append(idx)
            idx += 1
        results = set()
        results.add(word)
        for vowel in indexes:
            for pword in list(results):
                results = results.union(vowelPermeate(pword, vowel))
        
        output = output.union(results)
    return output
        
def doublifyFilter(word):
    """TESTING PURPOSES: Return the word with a random letter doubled."""
    letteridx = random.randint(0, len(word)-1)
    return word[0:letteridx] + word[letteridx] + word[letteridx:]

def vowelmungeFilter(word):
    """TESTING PURPOSES: Return the word with a random vowel replaced with a different vowel."""
    vidx = list()
    idx = 0
    for letter in word:
        if letter in _vowels:
            vidx.append(idx)
        idx += 1        
    if vidx:
        idx = random.choice(vidx)
        return word[0:idx] + random.choice(_vowels) + word[idx+1:]
    return word

def capifyFilter(word):
    """TESTING PURPOSES: Return the word with some of the letters capitalized."""
    nr = random.randint(0, len(word) - 1)
    idxes = list(range(len(word)))
    for i in range(nr):
        idx = random.randint(0, len(idxes) - 1)
        word = word[0:idx] + word[idx].swapcase() + word[idx+1:]
        del idxes[idx]
    return word
        

def levenshtein(worda, wordb):
    """Word difference in number of edits. Based on the Wikipedia article."""
    m=len(worda)+1
    n=len(wordb)+1
    
    matrix = list()
    for i in range(m):
        line = list()
        for j in range(n):
            line.append(0)
        matrix.append(line)
            

    for i in range(m):
        matrix[i][0] = i

    for i in range(n):
        matrix[0][i] = i


    for i in range(1,m):
        for j in range(1, n):
            cost = 1
            if worda[i-1] == wordb[j-1]:
                cost = 0
            matrix[i][j] = min(matrix[i-1][j] + 1,
                               matrix[i][j-1] + 1,
                               matrix[i-1][j-1] + cost)
            
    return matrix[len(worda)][len(wordb)]


class SpellMunger:
    wordList = dict()
    masterWordList = list()
    
    def __init__(self, wordListFile):
        """Accepts an iteratible list of words (an open file, a list, etc).
           Filters out possessive forms. Cleans up white space."""
        aposs = re.compile(".*'s$")
        for line in wordListFile:
            # filter out the 's lines 
            line = line.strip()
            if aposs.match(line):
                continue
            
            # we're just using the dict as a hash list for matching.
            # We'll store the actual spelling at the target (cap-wise),
            # but store the lowercase as the key. I had a really clever idea
            # about collecting all of the possible capitalization patterns
            # and applying them to candidate words, but this is so incredibly
            # simpler anyway, we'll go with it, especially since a dict is
            # being used for matching anyhow.
            if self.wordList.has_key(line.lower()):
                self.wordList[line.lower()].append(line)
            else:
                self.wordList[line.lower()] = [line,]


        for i in self.wordList.values():
            for w in i:
                self.masterWordList.append(w)

    def Check(self, word):
        """return RESULT,list. RESULT should be checked against 
           RESULT_GOOD/RESULT_NOMATCH/RESULT_MATCHES. List is a 
           list of possible matches in the form of 
           (Levenshtein Distance, word) sorted by the Levenshtein 
           distance."""

        if word in self.masterWordList:
            return RESULT_GOOD,None

        # potentially we could check after each step if the new varients found
        # in the step are potentials. This would make it return faster if an
        # early variant is available, but it wouldn't produce a complete list
        # of possibles. Easy to make optional.
        checklist = set()
        checklist.add(word.lower())
        
        checklist = singlifyFilter(checklist)
        vowelified = vowelifyFilter(checklist)

        outlist = list()
        for ckword in checklist:
            if self.wordList.has_key(ckword):
                for i in self.wordList[ckword]:
                    outlist.append((levenshtein(word, i), i))
        # I have discovered that the Levenshtein distance gives unexpected results
        # for some words, well that it has problems with the upcase differences because
        # it counts them as an edit. So LURK => lark preferentially because U=>u and U=>a
        # give the same number of edits. I score the vowelified results one point higher
        # to account for this.
        # Also vowelifying creates a bazillion more words, especially if a word has a lot of
        # vowels in it. So this part is pretty slow.
        for ckword in vowelified:
            if self.wordList.has_key(ckword):
                for i in self.wordList[ckword]:
                    outlist.append((levenshtein(word, i)+1, i))
    
        result = RESULT_NOMATCH
        if outlist:
            result = RESULT_MATCHES
            outlist.sort(lambda i,j: cmp(i[0], j[0]))
        return result, outlist


if __name__ == '__main__':
    mymunge = SpellMunger(file('/usr/share/dict/words','r'))

    # do a certain number of tests and exit.
    testcount = 0
    if len(sys.argv) > 1:
        testcount = int(sys.argv[1])
    if testcount:
        for i in range(testcount):
            word = random.choice(mymunge.masterWordList)
            print "TEST: word = ", word

            if random.randint(0,100) > 30:
                word = capifyFilter(word)
            if random.randint(0,100) > 30:
                word = vowelmungeFilter(word)
            if random.randint(0,100) > 30:
                word = doublifyFilter(word)

            print "TEST: mungified word = ",word

            res, possibles = mymunge.Check(word) 
            if res == RESULT_GOOD:
                print "TEST: GOOD"
            elif res == RESULT_NOMATCH:
                print "TEST: NO SUGGESTION"
            else:
                print "TEST: SUGGESTIONS ",possibles
            print "TEST: **"

    else:
        print "Interactive Mode. EOF or blank line to exit."
        try:
            word = raw_input("> ")
            while (word):
                word = word.strip()
                
                res,possibles = mymunge.Check(word)

                if res == RESULT_GOOD:
                    print word
                elif res == RESULT_NOMATCH:
                    print "NO SUGGESTION"
                else:
                    print possibles[0][1]

                word = raw_input("> ")        

        except EOFError:
            pass
        



        
        

        

