# new-paice.py

"""This module contains an object that implements the Paice-Husk stemming
algorithm.
If you just want to use the standard Paice-Husk stemming rules, use the
module-level ``stem`` object, a stemmer preloaded with the default rules::
    stemmed_word = stem.stem(word)
If you want to use a custom rule set, read the rules into a string where the
rules are separated by newlines, and instantiate the object with the string,
then use the object's stem method to stem words::
    stemmer = PaiceHuskStemmer(my_rules_string)
    stemmed_word = stemmer.stem(word)
"""

import re
from collections import defaultdict
from nltk import word_tokenize
from nltk.stem.lancaster import LancasterStemmer

class PaiceHuskStemmer(object):
    """Implements the Paice-Husk stemming algorithm.

    The stemmer is driven by a rule table given as text, one rule per line.
    Each rule has the form ``<ending>[*]<num><append><cont>`` where:

    * ``ending`` is the word ending to match, written in reverse
      (``gni`` matches words ending in ``ing``);
    * ``*`` (optional) restricts the rule to words that have not yet been
      modified by an earlier rule ("intact" words);
    * ``num`` is the number of trailing characters to remove (``0`` leaves
      the word unchanged, i.e. "protects" the ending);
    * ``append`` (optional) is the text added after the removal;
    * ``cont`` is ``>`` to keep applying rules to the result, or ``.`` to
      stop.

    Anything following the rule on the same line is ignored, so rules may
    carry trailing commentary.
    """

    # Parses one rule line; see the class docstring for the rule syntax.
    rule_expr = re.compile(r"""
    ^(?P<ending>\w+)
    (?P<intact>[*]?)
    (?P<num>\d+)
    (?P<append>\w*)
    (?P<cont>[.>])
    """, re.UNICODE | re.VERBOSE)

    # Extracts the leading run of word characters from the input.
    stem_expr = re.compile(r"^\w+", re.UNICODE)

    def __init__(self, ruletable):
        """
        :param ruletable: a string containing the rule data, separated
            by newlines.
        :raises ValueError: if a non-blank line is not a valid rule.
        """
        # Maps the final letter of an ending to the rules for that letter,
        # kept in the order they appear in the rule table.
        self.rules = defaultdict(list)
        self.read_rules(ruletable)

    def read_rules(self, ruletable):
        """Parse ``ruletable`` and add its rules to ``self.rules``.

        Blank lines are skipped. Rules are appended to any rules already
        loaded, so this may be called more than once.

        :param ruletable: a string containing the rule data, separated
            by newlines.
        :raises ValueError: if a non-blank line is not a valid rule.
        """
        rule_expr = self.rule_expr
        rules = self.rules

        for line in ruletable.split("\n"):
            line = line.strip()
            if not line:
                continue

            match = rule_expr.match(line)
            if not match:
                raise ValueError("Bad rule: %r" % line)

            # Endings are written reversed in the table; store them in
            # reading order so they can be tested with str.endswith().
            ending = match.group("ending")[::-1]
            lastchar = ending[-1]
            intact = match.group("intact") == "*"
            num = int(match.group("num"))
            append = match.group("append")
            cont = match.group("cont") == ">"

            rules[lastchar].append((ending, intact, num, append, cont))

    def first_vowel(self, word):
        """Return the index of the first vowel in ``word``.

        ``a``, ``e``, ``i``, ``o`` and ``u`` always count as vowels; the
        first ``y`` counts only when it is not the first letter of the word.

        :param word: the string to examine.
        :returns: the zero-based position of the first vowel, or ``-1`` if
            the word contains no vowel.
        """
        positions = [p for p in (word.find(v) for v in "aeiou") if p > -1]
        yp = word.find("y")
        if not positions:
            # No a/e/i/o/u at all: a non-initial "y" is the only candidate.
            return yp if yp > 0 else -1

        vp = min(positions)
        if 0 < yp < vp:
            return yp
        return vp

    def strip_prefix(self, word):
        """Remove the first matching measurement-style prefix from ``word``.

        :param word: the string to examine.
        :returns: ``word`` without its leading prefix, or ``word`` itself
            when it starts with none of the known prefixes.
        """
        for prefix in ("kilo", "micro", "milli", "intra", "ultra", "mega",
                       "nano", "pico", "pseudo"):
            if word.startswith(prefix):
                return word[len(prefix):]
        return word

    def stem(self, word):
        """Returns a stemmed version of the argument string.

        Only the leading run of word characters is stemmed; if ``word`` does
        not start with a word character it is returned unchanged.

        :param word: the string to stem.
        :returns: the stem.
        """
        rules = self.rules
        match = self.stem_expr.match(word)
        if not match:
            return word

        token = match.group(0)
        # A token that is nothing but a prefix (e.g. "kilo") would be
        # stripped to an empty string; keep the token itself in that case.
        stem = self.strip_prefix(token) or token

        is_intact = True
        continuing = True
        while continuing:
            pfv = self.first_vowel(stem)
            if pfv < 0:
                # No vowel: no acceptable stem can be produced, stop here.
                break

            rulelist = rules.get(stem[-1])
            if not rulelist:
                break

            continuing = False
            for ending, intact, num, append, cont in rulelist:
                if not stem.endswith(ending):
                    continue
                if intact and not is_intact:
                    continue

                newlen = len(stem) - num + len(append)
                if ((pfv == 0 and newlen < 2)
                        or (pfv > 0 and newlen < 3)):
                    # If word starts with vowel, minimum stem length is 2.
                    # If word starts with consonant, minimum stem length is
                    # 3.
                    continue

                is_intact = False
                # Slice by explicit length: a negative-index slice would turn
                # "remove 0 characters" into "remove everything".
                keep = max(len(stem) - num, 0)
                stem = stem[:keep] + append

                continuing = cont
                break

        return stem


# A one-rule table (strip a trailing "-ing" and continue). The parentheses are
# redundant: this is a plain str, not a tuple. It is not referenced anywhere
# else in the visible part of this file.
my_rules=("gni3>")

# The default rules for the Paice-Husk stemming algorithm.
#
# One rule per line, parsed by PaiceHuskStemmer.rule_expr:
#   <reversed ending>[*]<chars to remove>[<text to append>]<. or >>
# "*" means the rule only applies while the word is still intact (no rule has
# changed it yet), ">" means keep applying rules to the result and "." means
# stop. A removal count of 0 is used by the "protect" entries. The "{ ... }"
# text after each rule is human-readable commentary; the parser only matches
# the start of the line and ignores the rest.
defaultrules = """
ai*2.     { -ia > -   if intact }
a*1.      { -a > -    if intact }
bb1.      { -bb > -b   }
city3s.   { -ytic > -ys }
ci2>      { -ic > -    }
cn1t>     { -nc > -nt  }
dd1.      { -dd > -d   }
dei3y>    { -ied > -y  }
deec2ss.  { -ceed > -cess }
dee1.     { -eed > -ee }
de2>      { -ed > -    }
dooh4>    { -hood > -  }
e1>       { -e > -     }
feil1v.   { -lief > -liev }
fi2>      { -if > -    }
gni3>     { -ing > -   }
gai3y.    { -iag > -y  }
ga2>      { -ag > -    }
gg1.      { -gg > -g   }
ht*2.     { -th > -   if intact }
hsiug5ct. { -guish > -ct }
hsi3>     { -ish > -   }
i*1.      { -i > -    if intact }
i1y>      { -i > -y    }
ji1d.     { -ij > -id   --  see nois4j> & vis3j> }
juf1s.    { -fuj > -fus }
ju1d.     { -uj > -ud  }
jo1d.     { -oj > -od  }
jeh1r.    { -hej > -her }
jrev1t.   { -verj > -vert }
jsim2t.   { -misj > -mit }
jn1d.     { -nj > -nd  }
j1s.      { -j > -s    }
lbaifi6.  { -ifiabl > - }
lbai4y.   { -iabl > -y }
lba3>     { -abl > -   }
lbi3.     { -ibl > -   }
lib2l>    { -bil > -bl }
lc1.      { -cl > c    }
lufi4y.   { -iful > -y }
luf3>     { -ful > -   }
lu2.      { -ul > -    }
lai3>     { -ial > -   }
lau3>     { -ual > -   }
la2>      { -al > -    }
ll1.      { -ll > -l   }
mui3.     { -ium > -   }
mu*2.     { -um > -   if intact }
msi3>     { -ism > -   }
mm1.      { -mm > -m   }
nois4j>   { -sion > -j }
noix4ct.  { -xion > -ct }
noi3>     { -ion > -   }
nai3>     { -ian > -   }
na2>      { -an > -    }
nee0.     { protect  -een }
ne2>      { -en > -    }
nn1.      { -nn > -n   }
pihs4>    { -ship > -  }
pp1.      { -pp > -p   }
re2>      { -er > -    }
rae0.     { protect  -ear }
ra2.      { -ar > -    }
ro2>      { -or > -    }
ru2>      { -ur > -    }
rr1.      { -rr > -r   }
rt1>      { -tr > -t   }
rei3y>    { -ier > -y  }
sei3y>    { -ies > -y  }
sis2.     { -sis > -s  }
si2>      { -is > -    }
ssen4>    { -ness > -  }
ss0.      { protect  -ss }
suo3>     { -ous > -   }
su*2.     { -us > -   if intact }
s*1>      { -s > -    if intact }
s0.       { -s > -s    }
tacilp4y. { -plicat > -ply }
ta2>      { -at > -    }
tnem4>    { -ment > -  }
tne3>     { -ent > -   }
tna3>     { -ant > -   }
tpir2b.   { -ript > -rib }
tpro2b.   { -orpt > -orb }
tcud1.    { -duct > -duc }
tpmus2.   { -sumpt > -sum }
tpec2iv.  { -cept > -ceiv }
tulo2v.   { -olut > -olv }
tsis0.    { protect  -sist }
tsi3>     { -ist > -   }
tt1.      { -tt > -t   }
uqi3.     { -iqu > -   } 
ugo1.     { -ogu > -og }
vis3j>    { -siv > -j  }
vie0.     { protect  -eiv }
vi2>      { -iv > -    }
ylb1>     { -bly > -bl }
yli3y>    { -ily > -y  }
ylp0.     { protect  -ply }
yl2>      { -ly > -    }
ygo1.     { -ogy > -og }
yhp1.     { -phy > -ph }
ymo1.     { -omy > -om }
ypo1.     { -opy > -op }
yti3>     { -ity > -   }
yte3>     { -ety > -   }
ytl2.     { -lty > -l  }
yrtsi5.   { -istry > - }
yra3>     { -ary > -   }
yro3>     { -ory > -   }
yfi3.     { -ify > -   }
ycn2t>    { -ncy > -nt }
yca3>     { -acy > -   }
zi2>      { -iz > -    }
zy1s.     { -yz > -ys  }
"""

# Module-level stemmer preloaded with the standard rules. ``stem`` is a
# PaiceHuskStemmer instance, so words are stemmed with ``stem.stem(word)``.
stem = PaiceHuskStemmer(defaultrules)

if __name__ == "__main__":
    # Demo / smoke test. Guarded so that importing this module neither prints
    # anything nor requires the NLTK tokenizer data to be installed.
    paragraph = "lived"
    # Tokenized for experimentation; the demo below does not use it yet.
    tokens = word_tokenize(paragraph)
    # NLTK's Lancaster stemmer, printed alongside this implementation's
    # output for comparison.
    st = LancasterStemmer()
    print(st.stem('bought'))
    print(stem.stem(paragraph))