#include "platform.h"
// NOTE(review): the names of the system headers in this file were lost when
// the source was extracted (only bare "#include" tokens survived). The names
// below are reconstructed from the library calls this file makes
// (memcpy/strcmp, va_list, isdigit) -- confirm against version control.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include "metaphone.h"

////////////////////////////////////////////////////////////////////////////////
// Double Metaphone (c) 1998, 1999 by Lawrence Philips
//
// Slightly modified by Kevin Atkinson to fix several bugs and
// to allow it to give back more than 4 characters.
//
// 13-Dec-00 mtw Modified to return a number (e.g. 77th returns 77)
//
// Placed in the public domain by Lawrence Philips
//
////////////////////////////////////////////////////////////////////////////////
#include "metaphone.h"
#include <ctype.h>

#define AND &&
#define OR ||

namespace nsDmetaphone {

////////////////////////////////////////////////////////////////////////////////
// Default constructor: creates an empty MString.
////////////////////////////////////////////////////////////////////////////////
MString::MString()
{
}

////////////////////////////////////////////////////////////////////////////////
// Constructs an MString from a C string by forwarding it to cString.
////////////////////////////////////////////////////////////////////////////////
MString::MString(const char* in) : cString(in)
{
}

////////////////////////////////////////////////////////////////////////////////
// Constructs an MString as a copy of an existing cString.
////////////////////////////////////////////////////////////////////////////////
MString::MString(const cString& in) : cString(in)
{
}

////////////////////////////////////////////////////////////////////////////////
// Heuristic used by several rules: true when the word contains 'W', 'K',
// "CZ" or "WITZ".
//
// NOTE(review): this treats the result of cString::Find() as a boolean. That
// is only correct if Find() yields a falsy value when the text is absent; if
// it returns an index (-1 when missing, 0 for a match at the start) the test
// is inverted for those cases -- confirm against cString's declaration.
////////////////////////////////////////////////////////////////////////////////
bool MString::SlavoGermanic()
{
    return (Find('W') OR Find('K') OR Find("CZ") OR Find("WITZ"));
}

////////////////////////////////////////////////////////////////////////////////
// Appends the same code string to both the primary and the secondary key.
////////////////////////////////////////////////////////////////////////////////
inline void MString::MetaphAdd(const char* main)
{
    primary.Cat(main);
    secondary.Cat(main);
}

////////////////////////////////////////////////////////////////////////////////
// Appends the same single code character to both keys.
////////////////////////////////////////////////////////////////////////////////
inline void MString::MetaphAdd(const char main)
{
    primary += main;
    secondary += main;
}

////////////////////////////////////////////////////////////////////////////////
// Appends 'main' to the primary key and 'alt' to the secondary key.
//
//  - An empty 'main' adds nothing to the primary key.
//  - A non-empty 'alt' sets the 'alternate' flag; it is appended to the
//    secondary key unless it starts with a space (a space means "the
//    alternate encoding contributes nothing here").
//  - An empty 'alt' makes the secondary key receive 'main' instead (again
//    unless 'main' is empty or starts with a space).
////////////////////////////////////////////////////////////////////////////////
inline void MString::MetaphAdd(const char* main, const char* alt)
{
    if(*main)
        primary.Cat(main);

    if(*alt)
    {
        alternate = true;
        if(alt[0] != ' ')
            secondary.Cat(alt);
    }else
        if(*main AND (main[0] != ' '))
            secondary.Cat(main);
}

////////////////////////////////////////////////////////////////////////////////
// Returns true when the character at index 'at' is one of A, E, I, O, U, Y.
// Indexes outside [0, length) yield false. 'length' is the unpadded word
// length recorded by DoubleMetaphone(), so this is only meaningful while an
// encoding is in progress.
////////////////////////////////////////////////////////////////////////////////
bool MString::IsVowel(int at)
{
    if((at < 0) OR (at >= length))
        return false;

    char it = GetAt(at);

    if((it == 'A') OR (it == 'E') OR (it == 'I') OR (it == 'O') OR (it == 'U') OR (it == 'Y') )
        return true;

    return false;
}

////////////////////////////////////////////////////////////////////////////////
// Tests whether the 'len' characters starting at index 'start' equal any of
// the candidate strings passed as variadic arguments.
//
//  start - index of the first character to compare; negative => false.
//  len   - number of characters to compare.
//  ...   - candidate C strings; the list MUST be terminated by "".
//
// Returns true on the first exact match, false otherwise (including when the
// requested window does not fit inside the string).
////////////////////////////////////////////////////////////////////////////////
bool MString::StringAt(int start, int len, ... )
{
    if (start < 0)
        return false;

    char target[64];

    // The window is copied into a fixed scratch buffer; refuse lengths that
    // cannot fit (with the terminator) rather than overrun it.
    if ((len < 0) OR (len >= (int)sizeof(target)))
        return false;

    if (Len - start < len)
    {
        return false;
    }

    memcpy( target, Ptr + start, len );
    target[len] = 0;

    bool found = false;

    va_list sstrings;
    va_start(sstrings, len);

    for(;;)
    {
        const char* test = va_arg(sstrings, const char*);
        if(*test == 0)              // "" sentinel: end of the candidate list
            break;
        if(strcmp(target, test) == 0)
        {
            found = true;
            break;
        }
    }

    // Every va_start must be paired with va_end before returning, on the
    // match path as well as the no-match path.
    va_end(sstrings);

    return found;
}

////////////////////////////////////////////////////////////////////////////////
// main deal
//
// Computes the Double Metaphone keys for this string.
//
//  metaph  - receives the primary key.
//  metaph2 - receives the secondary (alternate) key.
//
// Side effects visible in this function: the string itself is upper-cased and
// has padding appended (Upper(), Cat()), and the members length, last,
// alternate, primary and secondary are overwritten.
//
// If the string is empty the function returns immediately and leaves both
// output parameters untouched.
//
// If the word starts with a digit, the leading run of digits is returned
// verbatim in both keys (e.g. "77th" gives "77").
//
// The historical four-character limit on the keys is disabled; the whole word
// is encoded.
////////////////////////////////////////////////////////////////////////////////
void MString::DoubleMetaphone(cString &metaph, cString &metaph2)
{
    int current = 0;

    length = Len;
    if(length < 1)
        return;
    last = length - 1;//zero based index

    alternate = false;
    primary = "";
    secondary = "";

    Upper();

    //pad the original string so that we can index beyond the edge of the world
    // NOTE(review): Lawrence Philips' reference implementation pads with
    // several spaces; only one is visible here (possibly lost to whitespace
    // collapsing) -- confirm against version control.
    Cat(" ");

    //skip these when at start of word
    if(StringAt(0, 2, "GN", "KN", "PN", "WR", "PS", ""))
        current += 1;

    //Initial 'X' is pronounced 'Z' e.g. 'Xavier'
    if(GetAt(0) == 'X')
    {
        MetaphAdd('S'); //'Z' maps to 'S'
        current += 1;
    }

    // <ctype.h> classification functions require a value representable as
    // unsigned char (or EOF). This code handles bytes >= 0x80 (see the '\307'
    // and '\321' cases), which are negative where plain char is signed, so
    // convert before calling isdigit().
    if (isdigit(static_cast<unsigned char>(GetAt(0))))
    {
        // Leading number: copy the digit run through unchanged.
        while ((current < length) && isdigit(static_cast<unsigned char>(GetAt(current))))
        {
            MetaphAdd(GetAt(current));
            current++;
        }
    }
    else
    ///////////main loop//////////////////////////
    // (the original "stop after 4 characters" condition was disabled, so the
    //  loop simply runs to the end of the unpadded word)
    while(current < length)
    {
        switch(GetAt(current))
        {
            case 'A':
            case 'E':
            case 'I':
            case 'O':
            case 'U':
            case 'Y':
                if(current == 0)
                    //all init vowels now map to 'A'
                    MetaphAdd('A');
                current +=1;
                break;

            case 'B':

                //"-mb", e.g", "dumb", already skipped over...
                MetaphAdd('P');

                if(GetAt(current + 1) == 'B')
                    current +=2;
                else
                    current +=1;
                break;

            case '\307': // ascii 0xc7 = C with cedilla
                MetaphAdd('S');
                current += 1;
                break;

            case 'C':
                //various germanic
                if((current > 1)
                    AND !IsVowel(current - 2)
                    AND StringAt((current - 1), 3, "ACH", "")
                    AND ((GetAt(current + 2) != 'I') AND ((GetAt(current + 2) != 'E')
                            OR StringAt((current - 2), 6, "BACHER", "MACHER", "")) ))
                {
                    MetaphAdd('K');
                    current +=2;
                    break;
                }

                //special case 'caesar'
                if((current == 0) AND StringAt(current, 6, "CAESAR", ""))
                {
                    MetaphAdd('S');
                    current +=2;
                    break;
                }

                //italian 'chianti'
                if(StringAt(current, 4, "CHIA", ""))
                {
                    MetaphAdd('K');
                    current +=2;
                    break;
                }

                if(StringAt(current, 2, "CH", ""))
                {
                    //find 'michael'
                    if((current > 0) AND StringAt(current, 4, "CHAE", ""))
                    {
                        MetaphAdd("K", "X");
                        current +=2;
                        break;
                    }

                    //greek roots e.g. 'chemistry', 'chorus'
                    if((current == 0)
                        AND (StringAt((current + 1), 5, "HARAC", "HARIS", "")
                            OR StringAt((current + 1), 3, "HOR", "HYM", "HIA", "HEM", ""))
                        AND !StringAt(0, 5, "CHORE", ""))
                    {
                        MetaphAdd('K');
                        current +=2;
                        break;
                    }

                    //germanic, greek, or otherwise 'ch' for 'kh' sound
                    if((StringAt(0, 4, "VAN ", "VON ", "") OR StringAt(0, 3, "SCH", ""))
                        // 'architect but not 'arch', 'orchestra', 'orchid'
                        OR StringAt((current - 2), 6, "ORCHES", "ARCHIT", "ORCHID", "")
                        OR StringAt((current + 2), 1, "T", "S", "")
                        OR ((StringAt((current - 1), 1, "A", "O", "U", "E", "") OR (current == 0))
                            //e.g., 'wachtler', 'wechsler', but not 'tichner'
                            AND StringAt((current + 2), 1, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ", "")))
                    {
                        MetaphAdd('K');
                    }else{
                        if(current > 0)
                        {
                            if(StringAt(0, 2, "MC", ""))
                                //e.g., "McHugh"
                                MetaphAdd('K');
                            else
                                MetaphAdd("X", "K");
                        }else
                            MetaphAdd('X');
                    }
                    current +=2;
                    break;
                }

                //e.g, 'czerny'
                if(StringAt(current, 2, "CZ", "") AND !StringAt((current - 2), 4, "WICZ", ""))
                {
                    MetaphAdd("S", "X");
                    current += 2;
                    break;
                }

                //e.g., 'focaccia'
                if(StringAt((current + 1), 3, "CIA", ""))
                {
                    MetaphAdd('X');
                    current += 3;
                    break;
                }

                //double 'C', but not if e.g. 'McClellan'
                if(StringAt(current, 2, "CC", "") AND !((current == 1) AND (GetAt(0) == 'M')))
                {
                    //'bellocchio' but not 'bacchus'
                    if(StringAt((current + 2), 1, "I", "E", "H", "") AND !StringAt((current + 2), 2, "HU", ""))
                    {
                        //'accident', 'accede' 'succeed'
                        if(((current == 1) AND (GetAt(current - 1) == 'A'))
                            OR StringAt((current - 1), 5, "UCCEE", "UCCES", ""))
                            MetaphAdd("KS");
                        //'bacci', 'bertucci', other italian
                        else
                            MetaphAdd('X');
                        current += 3;
                        break;
                    }else{//Pierce's rule
                        MetaphAdd('K');
                        current += 2;
                        break;
                    }
                }

                if(StringAt(current, 2, "CK", "CG", "CQ", ""))
                {
                    MetaphAdd('K');
                    current += 2;
                    break;
                }

                if(StringAt(current, 2, "CI", "CE", "CY", ""))
                {
                    //italian vs. english
                    if(StringAt(current, 3, "CIO", "CIE", "CIA", ""))
                        MetaphAdd("S", "X");
                    else
                        MetaphAdd('S');
                    current += 2;
                    break;
                }

                //else
                MetaphAdd('K');

                //name sent in 'mac caffrey', 'mac gregor
                if(StringAt((current + 1), 2, " C", " Q", " G", ""))
                    current += 3;
                else
                    if(StringAt((current + 1), 1, "C", "K", "Q", "")
                        AND !StringAt((current + 1), 2, "CE", "CI", ""))
                        current += 2;
                    else
                        current += 1;
                break;

            case 'D':
                if(StringAt(current, 2, "DG", ""))
                {
                    if(StringAt((current + 2), 1, "I", "E", "Y", ""))
                    {
                        //e.g. 'edge'
                        MetaphAdd('J');
                        current += 3;
                        break;
                    }else{
                        //e.g. 'edgar'
                        MetaphAdd("TK");
                        current += 2;
                        break;
                    }
                }

                if(StringAt(current, 2, "DT", "DD", ""))
                {
                    MetaphAdd('T');
                    current += 2;
                    break;
                }

                //else
                MetaphAdd('T');
                current += 1;
                break;

            case 'F':
                if(GetAt(current + 1) == 'F')
                    current += 2;
                else
                    current += 1;
                MetaphAdd('F');
                break;

            case 'G':
                if(GetAt(current + 1) == 'H')
                {
                    if((current > 0) AND !IsVowel(current - 1))
                    {
                        MetaphAdd('K');
                        current += 2;
                        break;
                    }

                    if(current < 3)
                    {
                        //'ghislane', ghiradelli
                        if(current == 0)
                        {
                            if(GetAt(current + 2) == 'I')
                                MetaphAdd('J');
                            else
                                MetaphAdd('K');
                            current += 2;
                            break;
                        }
                    }
                    //Parker's rule (with some further refinements) - e.g., 'hugh'
                    if(((current > 1) AND StringAt((current - 2), 1, "B", "H", "D", "") )
                        //e.g., 'bough'
                        OR ((current > 2) AND StringAt((current - 3), 1, "B", "H", "D", "") )
                        //e.g., 'broughton'
                        OR ((current > 3) AND StringAt((current - 4), 1, "B", "H", "") ) )
                    {
                        current += 2;
                        break;
                    }else{
                        //e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
                        if((current > 2)
                            AND (GetAt(current - 1) == 'U')
                            AND StringAt((current - 3), 1, "C", "G", "L", "R", "T", "") )
                        {
                            MetaphAdd('F');
                        }else
                            if((current > 0) AND GetAt(current - 1) != 'I')
                                MetaphAdd('K');

                        current += 2;
                        break;
                    }
                }

                if(GetAt(current + 1) == 'N')
                {
                    if((current == 1) AND IsVowel(0) AND !SlavoGermanic())
                    {
                        MetaphAdd("KN", "N");
                    }else
                        //not e.g. 'cagney'
                        if(!StringAt((current + 2), 2, "EY", "")
                            AND (GetAt(current + 1) != 'Y') AND !SlavoGermanic())
                        {
                            MetaphAdd("N", "KN");
                        }else
                            MetaphAdd("KN");
                    current += 2;
                    break;
                }

                //'tagliaro'
                if(StringAt((current + 1), 2, "LI", "") AND !SlavoGermanic())
                {
                    MetaphAdd("KL", "L");
                    current += 2;
                    break;
                }

                //-ges-,-gep-,-gel-, -gie- at beginning
                if((current == 0)
                    AND ((GetAt(current + 1) == 'Y')
                        OR StringAt((current + 1), 2, "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER", "")) )
                {
                    MetaphAdd("K", "J");
                    current += 2;
                    break;
                }

                // -ger-, -gy-
                if((StringAt((current + 1), 2, "ER", "") OR (GetAt(current + 1) == 'Y'))
                    AND !StringAt(0, 6, "DANGER", "RANGER", "MANGER", "")
                    AND !StringAt((current - 1), 1, "E", "I", "")
                    AND !StringAt((current - 1), 3, "RGY", "OGY", "") )
                {
                    MetaphAdd("K", "J");
                    current += 2;
                    break;
                }

                // italian e.g, 'biaggi'
                if(StringAt((current + 1), 1, "E", "I", "Y", "") OR StringAt((current - 1), 4, "AGGI", "OGGI", ""))
                {
                    //obvious germanic
                    if((StringAt(0, 4, "VAN ", "VON ", "") OR StringAt(0, 3, "SCH", ""))
                        OR StringAt((current + 1), 2, "ET", ""))
                        MetaphAdd('K');
                    else
                        //always soft if french ending
                        if(StringAt((current + 1), 4, "IER ", ""))
                            MetaphAdd('J');
                        else
                            MetaphAdd("J", "K");
                    current += 2;
                    break;
                }

                if(GetAt(current + 1) == 'G')
                    current += 2;
                else
                    current += 1;
                MetaphAdd('K');
                break;

            case 'H':
                //only keep if first & before vowel or btw. 2 vowels
                if(((current == 0) OR IsVowel(current - 1))
                    AND IsVowel(current + 1))
                {
                    MetaphAdd('H');
                    current += 2;
                }else//also takes care of 'HH'
                    current += 1;
                break;

            case 'J':
                //obvious spanish, 'jose', 'san jacinto'
                if(StringAt(current, 4, "JOSE", "") OR StringAt(0, 4, "SAN ", "") )
                {
                    if(((current == 0) AND (GetAt(current + 4) == ' ')) OR StringAt(0, 4, "SAN ", "") )
                        MetaphAdd('H');
                    else
                    {
                        MetaphAdd("J", "H");
                    }
                    current +=1;
                    break;
                }

                if((current == 0) AND !StringAt(current, 4, "JOSE", ""))
                    MetaphAdd("J", "A");//Yankelovich/Jankelowicz
                else
                    //spanish pron. of e.g. 'bajador'
                    if(IsVowel(current - 1)
                        AND !SlavoGermanic()
                        AND ((GetAt(current + 1) == 'A') OR (GetAt(current + 1) == 'O')))
                        MetaphAdd("J", "H");
                    else
                        if(current == last)
                            MetaphAdd("J", " ");
                        else
                            if(!StringAt((current + 1), 1, "L", "T", "K", "S", "N", "M", "B", "Z", "")
                                AND !StringAt((current - 1), 1, "S", "K", "L", ""))
                                MetaphAdd('J');

                if(GetAt(current + 1) == 'J')//it could happen!
                    current += 2;
                else
                    current += 1;
                break;

            case 'K':
                if(GetAt(current + 1) == 'K')
                    current += 2;
                else
                    current += 1;
                MetaphAdd('K');
                break;

            case 'L':
                if(GetAt(current + 1) == 'L')
                {
                    //spanish e.g. 'cabrillo', 'gallegos'
                    if(((current == (length - 3))
                            AND StringAt((current - 1), 4, "ILLO", "ILLA", "ALLE", ""))
                        OR ((StringAt((last - 1), 2, "AS", "OS", "") OR StringAt(last, 1, "A", "O", ""))
                            AND StringAt((current - 1), 4, "ALLE", "")) )
                    {
                        MetaphAdd("L", " ");
                        current += 2;
                        break;
                    }
                    current += 2;
                }else
                    current += 1;
                MetaphAdd('L');
                break;

            case 'M':
                if((StringAt((current - 1), 3, "UMB", "")
                        AND (((current + 1) == last) OR StringAt((current + 2), 2, "ER", "")))
                    //'dumb','thumb'
                    OR (GetAt(current + 1) == 'M') )
                    current += 2;
                else
                    current += 1;
                MetaphAdd('M');
                break;

            case 'N':
                if(GetAt(current + 1) == 'N')
                    current += 2;
                else
                    current += 1;
                MetaphAdd('N');
                break;

            case '\321': // Ascii 0xD1 = capital N with tilde
                current += 1;
                MetaphAdd('N');
                break;

            case 'P':
                if(GetAt(current + 1) == 'H')
                {
                    MetaphAdd('F');
                    current += 2;
                    break;
                }

                //also account for "campbell", "raspberry"
                if(StringAt((current + 1), 1, "P", "B", ""))
                    current += 2;
                else
                    current += 1;
                MetaphAdd('P');
                break;

            case 'Q':
                if(GetAt(current + 1) == 'Q')
                    current += 2;
                else
                    current += 1;
                MetaphAdd('K');
                break;

            case 'R':
                //french e.g. 'rogier', but exclude 'hochmeier'
                if((current == last)
                    AND !SlavoGermanic()
                    AND StringAt((current - 2), 2, "IE", "")
                    AND !StringAt((current - 4), 2, "ME", "MA", ""))
                    MetaphAdd("", "R");
                else
                    MetaphAdd('R');

                if(GetAt(current + 1) == 'R')
                    current += 2;
                else
                    current += 1;
                break;

            case 'S':
                //special cases 'island', 'isle', 'carlisle', 'carlysle'
                if(StringAt((current - 1), 3, "ISL", "YSL", ""))
                {
                    current += 1;
                    break;
                }

                //special case 'sugar-'
                if((current == 0) AND StringAt(current, 5, "SUGAR", ""))
                {
                    MetaphAdd("X", "S");
                    current += 1;
                    break;
                }

                if(StringAt(current, 2, "SH", ""))
                {
                    //germanic
                    if(StringAt((current + 1), 4, "HEIM", "HOEK", "HOLM", "HOLZ", ""))
                        MetaphAdd('S');
                    else
                        MetaphAdd('X');
                    current += 2;
                    break;
                }

                //italian & armenian
                if(StringAt(current, 3, "SIO", "SIA", "") OR StringAt(current, 4, "SIAN", ""))
                {
                    if(!SlavoGermanic())
                        MetaphAdd("S", "X");
                    else
                        MetaphAdd('S');
                    current += 3;
                    break;
                }

                //german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
                //also, -sz- in slavic language altho in hungarian it is pronounced 's'
                if(((current == 0)
                        AND StringAt((current + 1), 1, "M", "N", "L", "W", ""))
                    OR StringAt((current + 1), 1, "Z", ""))
                {
                    MetaphAdd("S", "X");
                    if(StringAt((current + 1), 1, "Z", ""))
                        current += 2;
                    else
                        current += 1;
                    break;
                }

                if(StringAt(current, 2, "SC", ""))
                {
                    //Schlesinger's rule
                    if(GetAt(current + 2) == 'H')
                    {
                        //dutch origin, e.g. 'school', 'schooner'
                        if(StringAt((current + 3), 2, "OO", "ER", "EN", "UY", "ED", "EM", ""))
                        {
                            //'schermerhorn', 'schenker'
                            if(StringAt((current + 3), 2, "ER", "EN", ""))
                            {
                                MetaphAdd("X", "SK");
                            }else
                                MetaphAdd("SK");
                            current += 3;
                            break;
                        }else{
                            if((current == 0) AND !IsVowel(3) AND (GetAt(3) != 'W'))
                                MetaphAdd("X", "S");
                            else
                                MetaphAdd('X');
                            current += 3;
                            break;
                        }
                    }

                    if(StringAt((current + 2), 1, "I", "E", "Y", ""))
                    {
                        MetaphAdd('S');
                        current += 3;
                        break;
                    }
                    //else
                    MetaphAdd("SK");
                    current += 3;
                    break;
                }

                //french e.g. 'resnais', 'artois'
                if((current == last) AND StringAt((current - 2), 2, "AI", "OI", ""))
                    MetaphAdd("", "S");
                else
                    MetaphAdd('S');

                if(StringAt((current + 1), 1, "S", "Z", ""))
                    current += 2;
                else
                    current += 1;
                break;

            case 'T':
                if(StringAt(current, 4, "TION", ""))
                {
                    MetaphAdd('X');
                    current += 3;
                    break;
                }

                if(StringAt(current, 3, "TIA", "TCH", ""))
                {
                    MetaphAdd('X');
                    current += 3;
                    break;
                }

                if(StringAt(current, 2, "TH", "")
                    OR StringAt(current, 3, "TTH", ""))
                {
                    //special case 'thomas', 'thames' or germanic
                    if(StringAt((current + 2), 2, "OM", "AM", "")
                        OR StringAt(0, 4, "VAN ", "VON ", "") OR StringAt(0, 3, "SCH", ""))
                    {
                        MetaphAdd('T');
                    }else{
                        MetaphAdd("0", "T");
                    }
                    current += 2;
                    break;
                }

                if(StringAt((current + 1), 1, "T", "D", ""))
                    current += 2;
                else
                    current += 1;
                MetaphAdd('T');
                break;

            case 'V':
                if(GetAt(current + 1) == 'V')
                    current += 2;
                else
                    current += 1;
                MetaphAdd('F');
                break;

            case 'W':
                //can also be in middle of word
                if(StringAt(current, 2, "WR", ""))
                {
                    MetaphAdd('R');
                    current += 2;
                    break;
                }

                if((current == 0)
                    AND (IsVowel(current + 1) OR StringAt(current, 2, "WH", "")))
                {
                    //Wasserman should match Vasserman
                    if(IsVowel(current + 1))
                        MetaphAdd("A", "F");
                    else
                        //need Uomo to match Womo
                        MetaphAdd('A');
                }

                //Arnow should match Arnoff
                if(((current == last) AND IsVowel(current - 1))
                    OR StringAt((current - 1), 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY", "")
                    OR StringAt(0, 3, "SCH", ""))
                {
                    MetaphAdd("", "F");
                    current +=1;
                    break;
                }

                //polish e.g. 'filipowicz'
                if(StringAt(current, 4, "WICZ", "WITZ", ""))
                {
                    MetaphAdd("TS", "FX");
                    current +=4;
                    break;
                }

                //else skip it
                current +=1;
                break;

            case 'X':
                //french e.g. breaux
                if(!((current == last)
                    AND (StringAt((current - 3), 3, "IAU", "EAU", "")
                        OR StringAt((current - 2), 2, "AU", "OU", ""))) )
                    MetaphAdd("KS");

                if(StringAt((current + 1), 1, "C", "X", ""))
                    current += 2;
                else
                    current += 1;
                break;

            case 'Z':
                //chinese pinyin e.g. 'zhao'
                if(GetAt(current + 1) == 'H')
                {
                    MetaphAdd('J');
                    current += 2;
                    break;
                }else
                    if(StringAt((current + 1), 2, "ZO", "ZI", "ZA", "")
                        OR (SlavoGermanic() AND ((current > 0) AND GetAt(current - 1) != 'T')))
                    {
                        MetaphAdd("S", "TS");
                    }
                    else
                        MetaphAdd('S');

                if(GetAt(current + 1) == 'Z')
                    current += 2;
                else
                    current += 1;
                break;

            default:
                current += 1;
        }
    }

    metaph = primary.Ptr;
    //only give back 4 char metaph
    //if(metaph.Len > 4)
    //    metaph.SetAt(4,'\0');

    metaph2 = secondary.Ptr;
    //if(metaph2.Len > 4)
    //    metaph2.SetAt(4,'\0');
}

}//namespace