#!/usr/local/bin/perl -w # # Perl implementation of the porter stemming algorithm # described in the paper: "An algorithm for suffix stripping, M F Porter" # http://www.muscat.com/~martin/stem.html # # Daniel van Balen (vdaniel@ldc.usb.ve) # # October-1999 # # To Use: # # Put the line "use porter;" in your code. This will import the subroutine # porter into your current name space (by default this is Main:: ). Make # sure this file, "porter.pm" is in your @INC path (it includes the current # directory). # Afterwards use by calling "porter()" where is the word to strip. # The stripped word will be the returned value. # # REMEMBER TO CHANGE THE FIRST LINE TO POINT TO THE PATH TO YOUR PERL # BINARY # # We need the lookbehind re operator "(?<= ... )" and "(?= perl5.005 require 5.005; sub porter{ # First and only argument. The word to be stemmed. should be lower-case # an "i" could be apended to the regular expresions to make them case # insensitive. my $word=shift; if(length($word)>2){ # This is a consonant. Not "aiueo" and "y" only if preceded by a vowel my $c='(?:[^aiueoy]|(?:(?<=[aiueo])y)|\by)'; #reconoce una consonante # This is a vowel. "aiueo" and "y" if preceded by a consonant my $v='(?:[aiueo]|(?:(? 0) my $m_gt_0 = "^(?:$c+)?(?:$v+$c+){1,}(?:$v+)?\$"; # Matches if (m > 1) my $m_gt_1 = "^(?:$c+)?(?:$v+$c+){2,}(?:$v+)?\$"; # Matches if (m = 1) my $m_eq_1="^(?:$c+)?(?:$v+$c+){1}(?:$v+)?\$"; # Matches *o my $o="$c$v(?:[^aiueowxy])\$"; # Matches *d my $d="($c)\\1\$"; #STEP 1a if($word =~ /(.+)sses$/){ $word=$1."ss"; } elsif($word =~ /(.+)ies$/){ $word=$1."i"; } elsif($word =~ /(.+[^s])s$/){ # engloba 2 ultimas reglas de 1a $word=$1; # Same as last 2 rules of 1a } #STEP 1b if($word =~ /(.+)eed$/) { if (($w=$1) =~ /$m_gt_0/o){ $word=$w."ee"; } } elsif($word =~ /(.+)ed$/) { if (($w=$1) =~ /$v/o) { $word=$w; $extra=1; } } elsif($word =~ /(.+)ing$/){ if(($w=$1) =~ /$v/o) { $word=$w; $extra=1; } } # If 2nd or 3rd of the previous rules was successful try the extra rules... #Si aplicaron alguna de las dos ultimas reglas de "1b" hacemos las siguientes if($extra){ if($word =~ /(.+)at$/){ $word=$1."ate"; } elsif($word =~ /(.+)bl$/){ $word=$1."ble"; } elsif($word =~ /(.+)iz$/){ $word=$1."ize"; } # (*d and not (*L or *S or *Z)) --> single letter elsif(($word =~ /$d/o) and ($word !~ /[lsz]$/)){ $word=substr($word,0,-1); } # (m=1 and *o) --> E elsif(($word =~ /$m_eq_1/o) and ($word =~ /$o/o)){ $word.='e'; } } # STEP 1c if($word =~ /(.+)y$/){ if (($w=$1) =~ /$v/o){ $word=$w."i"; } } #STEP 2 con crazy performance hack descrito en paper # To speed up the algorithm we do a switch on the penultimate letter of the # word being tested. Same for steps 3 and 4. my $letter=substr($word,-2,1); if($letter eq "a"){ if($word =~ /(.+)ational$/){ if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ate"; } } elsif($word =~ /(.+)tional$/){ if(($w=$1) =~ /$m_gt_0/o){ $word=$w."tion"; } } } elsif($letter eq "c"){ if($word =~ /(.+)enci$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ence"; } } elsif($word =~ /(.+)anci$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ance"; } } } elsif($letter eq "e"){ if($word =~ /(.+)izer$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ize"; } } } #NEW RULE SEE "http://www.muscat.com/~martin/stem.html" elsif($letter eq "g"){ if($word =~ /(.+)logi$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."log"; } } } elsif($letter eq "l"){ #RULE CHANGED SEE "http://www.muscat.com/~martin/stem.html" if($word =~ /(.+)bli$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ble"; } } elsif($word =~ /(.+)alli$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."al"; } } elsif($word =~ /(.+)entli$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ent"; } } elsif($word =~ /(.+)eli$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."e"; } } elsif($word =~ /(.+)ousli$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ous"; } } } elsif($letter eq "o"){ if($word =~ /(.+)ization$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ize"; } } elsif($word =~ /(.+)ation$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ate"; } } elsif($word =~ /(.+)ator$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ate"; } } } elsif($letter eq "s"){ if($word =~ /(.+)alism$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."al"; } } elsif($word =~ /(.+)iveness$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ive"; } } elsif($word =~ /(.+)fulness$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ful"; } } elsif($word =~ /(.+)ousness$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ous"; } } } elsif($letter eq "t"){ if($word =~ /(.+)aliti$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."al"; } } elsif($word =~ /(.+)iviti$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ive"; } } elsif($word =~ /(.+)biliti$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ble"; } } } #STEP 3 $letter=substr($word,-1,1); if($letter eq "e"){ if($word =~ /(.+)icate$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ic"; } } elsif($word =~ /(.+)ative$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w; } } elsif($word =~ /(.+)alize$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."al"; } } } elsif($letter eq "i"){ if($word =~ /(.+)iciti$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ic"; } } } elsif($letter eq "l"){ if($word =~ /(.+)ical$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ic"; } } elsif($word =~ /(.+)ful$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w; } } } elsif($letter eq "s"){ if($word =~ /(.+)ness$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w; } } } #STEP 4 $letter=substr($word,-2,1); if($letter eq "a"){ if($word =~ /(.+)al$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "c"){ if($word =~ /(.+)ance$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } elsif($word =~ /(.+)ence$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "e"){ if($word =~ /(.+)er$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "i"){ if($word =~ /(.+)ic$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "l"){ if($word =~ /(.+)able$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } elsif($word =~ /(.+)ible$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "n"){ if($word =~ /(.+)ant$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } elsif($word =~ /(.+)ement$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } elsif($word =~ /(.+)ment$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } elsif($word =~ /(.+)ent$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "o"){ if($word =~ /(.+)ion$/) { if((($w=$1) =~ /[st]$/) and ($w =~ /$m_gt_1/o)){ $word=$w; } } elsif($word =~ /(.+)ou$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "s"){ if($word =~ /(.+)ism$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "t"){ if($word =~ /(.+)ate$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } elsif($word =~ /(.+)iti$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "u"){ if($word =~ /(.+)ous$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "v"){ if($word =~ /(.+)ive$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "z"){ if($word =~ /(.+)ize$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } #STEP 5a if($word =~ /(.+)e$/) { if((($w=$1) =~ /$m_gt_1/o) or (($w =~ /$m_eq_1/o) and ($w !~ /$o/o))){ $word=$w; } } #STEP 5b #(m>1 and *d and *L) --> if($word =~ /ll$/) { if($word =~ /$m_gt_1/o){ $word=substr($word,0,length($word)-1); } } # It's stemmed so I guess we can give it back :-) } return $word; }