Pārlūkot izejas kodu

Merge pull request #6833 from richardkchapman/move-metaphone

HPCC-12810 Include dmetaphone in the OSS version of HPCC Systems

Reviewed-by: Gavin Halliday <ghalliday@hpccsystems.com>
Gavin Halliday 10 gadi atpakaļ
vecāks
revīzija
851e845055

+ 41 - 0
ecllibrary/std/Metaphone.ecl

@@ -0,0 +1,41 @@
+/*##############################################################################
+## HPCC SYSTEMS software Copyright (C) 2015 HPCC Systems.  All rights reserved.
+############################################################################## */
+
+
+EXPORT Metaphone := MODULE
+
+
+IMPORT lib_metaphone;
+
+/**
+ * Returns the primary metaphone value.
+ *
+ * @param src           The string whose metaphone is to be calculated.
+ * @return              The result of lib_metaphone.MetaphoneLib.DMetaphone1 for src.
+ * @see                 http://en.wikipedia.org/wiki/Metaphone#Double_Metaphone
+ */
+
+EXPORT String primary(STRING src) :=
+  lib_metaphone.MetaphoneLib.DMetaphone1(src);
+
+/**
+ * Returns the secondary metaphone value.
+ *
+ * @param src           The string whose metaphone is to be calculated.
+ * @return              The result of lib_metaphone.MetaphoneLib.DMetaphone2 for src.
+ * @see                 http://en.wikipedia.org/wiki/Metaphone#Double_Metaphone
+ */
+
+EXPORT String secondary(STRING src) :=
+  lib_metaphone.MetaphoneLib.DMetaphone2(src);
+
+/**
+ * Returns the double metaphone value (primary and secondary concatenated).
+ *
+ * @param src           The string whose metaphone is to be calculated.
+ * @return              The result of lib_metaphone.MetaphoneLib.DMetaphoneBoth for src.
+ * @see                 http://en.wikipedia.org/wiki/Metaphone#Double_Metaphone
+ */
+
+EXPORT String double(STRING src) :=
+  lib_metaphone.MetaphoneLib.DMetaphoneBoth(src);
+
+END;

+ 15 - 0
ecllibrary/teststd/Metaphone/TestMetaphone.ecl

@@ -0,0 +1,15 @@
+/*##############################################################################
+## HPCC SYSTEMS software Copyright (C) 2015 HPCC Systems.  All rights reserved.
+############################################################################## */
+IMPORT Std.Metaphone;
+
+EXPORT TestMetaphone := MODULE
+
+  // Constant-input checks of the three Std.Metaphone entry points, all using the name 'Algernon'.
+  EXPORT TestConst := MODULE
+    // primary() is expected to give 'ALKRNN'
+    EXPORT Test01 := ASSERT(Metaphone.primary('Algernon') = 'ALKRNN');
+    // secondary() is expected to give 'ALJRNN'
+    EXPORT Test02 := ASSERT(Metaphone.secondary('Algernon') = 'ALJRNN');
+    // double() is expected to give the primary value followed by the secondary value
+    EXPORT Test03 := ASSERT(Metaphone.double('Algernon') = 'ALKRNNALJRNN');
+  END;
+
+  // Evaluates every assertion in TestConst.
+  EXPORT Main := [EVALUATE(TestConst)];
+END;

+ 1 - 0
plugins/CMakeLists.txt

@@ -15,6 +15,7 @@
 ################################################################################
 add_subdirectory (auditlib)
 add_subdirectory (debugservices)
+add_subdirectory (dmetaphone)
 add_subdirectory (fileservices)
 add_subdirectory (logging)
 add_subdirectory (parselib)

+ 215 - 0
plugins/common/cstring.h

@@ -0,0 +1,215 @@
+//  cstring.h  support for C++ cString class
+//
+#ifndef _CSTRING_H
+#define _CSTRING_H
+
+#include <string.h>
+#include <ctype.h>
+
+#define CSINITIALSIZE 8         // size of statically allocated string buffer
+
+// Simple NUL-terminated string with a small inline buffer.
+//
+// Strings shorter than CSINITIALSIZE characters live in the inline buffer; longer ones are
+// held in a heap allocation owned by the object.  Len and Ptr are public (existing callers
+// read them directly); the class keeps Ptr[Len] == '\0' after every operation.
+class cString
+{
+private:
+    int     iBufLen;        // capacity of the buffer Ptr refers to, terminator included
+    char    buffer[CSINITIALSIZE];  // inline storage used while the string is short
+
+    // Frees the heap buffer, if there is one.  The caller must repoint Ptr afterwards.
+    void csReleaseHeap() {
+        if (Ptr != buffer) {
+            delete [] Ptr;
+        }
+    }
+
+    // Shared constructor logic: selects inline or heap storage for n characters of src.
+    void csInit(const char *src, int n) {
+        Len = n;
+        iBufLen = Len + 1;
+        if (iBufLen <= CSINITIALSIZE) {
+            iBufLen = CSINITIALSIZE;
+            Ptr = buffer;
+        } else {
+            Ptr = new char[ iBufLen ];
+        }
+        if (Len > 0) {
+            memcpy( Ptr, src, Len );
+        }
+        Ptr[Len] = '\0';
+    }
+
+    // Replaces the contents with the first n characters of src.
+    // The new buffer is filled before the old one is released, so src may point into
+    // this string's own storage.
+    void csAssign(const char *src, int n) {
+        if (n < 0) {
+            n = 0;
+        }
+        if (n >= iBufLen) {
+            char *fresh = new char[n + 1];
+            memcpy( fresh, src, n );
+            csReleaseHeap();
+            Ptr = fresh;
+            iBufLen = n + 1;
+        } else if (n > 0) {
+            memmove( Ptr, src, n );     // memmove: src may overlap Ptr
+        }
+        Len = n;
+        Ptr[Len] = '\0';
+    }
+
+protected:
+    // Replaces the contents with the NUL-terminated string Str.
+    // A null pointer is treated as an empty string.
+    void Set(const char *Str) {
+        csAssign( Str, Str ? (int) strlen( Str ) : 0 );
+    }
+
+public:
+    int     Len;            // current string length
+    char    *Ptr;           // current string buffer
+
+    // Constructs an empty string in the inline buffer.
+    cString() {
+        iBufLen = CSINITIALSIZE;
+        Len = 0;
+        Ptr = buffer;
+        *Ptr = '\0';
+    }
+
+    // Constructs a copy of the NUL-terminated string Str.
+    // A null pointer is treated as an empty string, matching Set(const char *).
+    cString(const char *Str) {
+        csInit( Str, Str ? (int) strlen( Str ) : 0 );
+    }
+
+    // Copy constructor: duplicates the other string's characters into own storage.
+    cString(const cString &cStr) {
+        csInit( cStr.Ptr, cStr.Len );
+    }
+
+    inline ~cString() {
+        csReleaseHeap();
+    }
+
+    // Replaces the contents with the first len characters of Str (Str need not be
+    // NUL-terminated).  A negative len is treated as zero.
+    void Set(const char *Str, int len) {
+        csAssign( Str, len );
+    }
+
+    // Sets the logical length to len, growing the buffer when required.
+    // Existing characters are kept; characters between the old and the new length are
+    // unspecified until the caller writes them.  The string is terminated at len.
+    void SetLength(int len) {
+        if (len < 0) {
+            len = 0;
+        }
+        if (len >= iBufLen) {
+            char *fresh = new char[len + 1];
+            if (Len > 0) {
+                memcpy( fresh, Ptr, Len );
+            }
+            csReleaseHeap();
+            Ptr = fresh;
+            iBufLen = len + 1;
+        }
+        Len = len;
+        Ptr[Len] = '\0';
+    }
+
+
+    // Removes trailing space characters.
+    void Trim() {
+        // Index based so that no pointer before the start of the buffer is ever formed.
+        while (Len > 0 && Ptr[Len - 1] == ' ') {
+            Len--;
+        }
+        Ptr[Len] = '\0';
+    }
+
+    // Converts the string to upper case in place.
+    void Upper() {
+        for (int i = 0; i < Len; i++) {
+            // toupper() is only defined for unsigned char values and EOF; plain char
+            // may be negative for bytes >= 0x80.
+            Ptr[i] = (char) toupper( (unsigned char) Ptr[i] );
+        }
+    }
+
+    // Converts the string to lower case in place.
+    void Lower() {
+        for (int i = 0; i < Len; i++) {
+            // see Upper() for the reason behind the cast
+            Ptr[i] = (char) tolower( (unsigned char) Ptr[i] );
+        }
+    }
+
+    // Concatenate a string to the existing string
+    // Appends tlen characters from string.  string may point into this object's own
+    // buffer: the grown buffer is completely built before the old one is released.
+    void Cat(const char *string, int tlen) {
+        if (tlen <= 0) {
+            return;
+        }
+        int newLen = Len + tlen;
+        if (newLen >= iBufLen) {
+            char *tPtr = new char[newLen + 1];
+            memcpy( tPtr, Ptr, Len );
+            memcpy( tPtr + Len, string, tlen );
+            csReleaseHeap();
+            Ptr = tPtr;
+            iBufLen = newLen + 1;
+        } else {
+            memmove( Ptr + Len, string, tlen );     // memmove: string may overlap Ptr
+        }
+        Len = newLen;
+        Ptr[Len] = '\0';
+    }
+
+    // Appends another cString (appending a string to itself is supported).
+    inline void Cat(const cString &string) {
+        Cat(string.Ptr, string.Len);
+    }
+
+    // Appends a NUL-terminated string.
+    inline void Cat( const char *string ) {
+        Cat( string, (int) strlen(string));
+    }
+
+    inline cString& operator =(const char *Str) {
+        Set(Str);
+        return(*this);
+    }
+
+    inline cString& operator=(const cString &cStr) {
+        if (this != &cStr) {
+            csAssign(cStr.Ptr, cStr.Len);
+        }
+        return *this;
+    }
+
+    cString& operator+=(const cString &cStr) {
+        Cat(cStr.Ptr, cStr.Len);
+        return *this;
+    }
+
+    // Appends a NUL-terminated string and returns the (possibly reallocated) buffer.
+    char *operator+=(const char *Str) {
+        Cat(Str, (int) strlen(Str));
+        return Ptr;
+    }
+
+    cString& operator+=(const char ch)  {
+        Cat(&ch, 1);
+        return(*this);
+    }
+
+    inline operator char*() {
+        return(Ptr);
+    }
+};
+
+#endif  //_CSTRING_H

+ 30 - 0
plugins/dmetaphone/CMakeLists.txt

@@ -0,0 +1,30 @@
+# Component: dmetaphone
+
+#####################################################
+# Description:
+# ------------
+#    Cmake Input File for dmetaphone
+#####################################################
+
+# NOTE(review): toolsdir is not referenced again in this file - confirm it is still needed
+set ( toolsdir "${HPCC_SOURCE_DIR}/tools" )
+
+
+project( dmetaphone )
+
+# Sources compiled into the plugin
+set (    SRCS
+         dmetaphone.cpp
+         metaphone.cpp
+    )
+
+# plugins/common is added to the include path alongside system/include
+include_directories (
+         ${HPCC_SOURCE_DIR}/plugins/common
+         ${HPCC_SOURCE_DIR}/system/include
+    )
+
+# DMETAPHONE_EXPORTS selects the dllexport branch of DMETAPHONE_API in dmetaphone.hpp
+ADD_DEFINITIONS( -D_USRDLL -DDMETAPHONE_EXPORTS )
+
+# Build a shared library, install it into the plugins directory and link it against jlib
+HPCC_ADD_LIBRARY( dmetaphone SHARED ${SRCS} )
+install ( TARGETS dmetaphone DESTINATION plugins )
+target_link_libraries ( dmetaphone
+         jlib
+    )

+ 118 - 0
plugins/dmetaphone/dmetaphone.cpp

@@ -0,0 +1,118 @@
+#include "platform.h"
+#include <time.h>
+#include <stdlib.h>
+#include <string.h>
+#include "dmetaphone.hpp"
+#include "metaphone.h"
+
+// Version string reported through ECLPluginDefinitionBlock::version
+#define DMETAPHONE_VERSION "DMETAPHONE 1.1.05"
+
+// NULL-terminated list of version strings handed back through
+// ECLPluginDefinitionBlockEx::compatibleVersions
+static const char * compatibleVersions[] = {
+    "DMETAPHONE 1.1.05 [0e64c86ec1d5771d4ce0abe488a98a2a]",
+    "DMETAPHONE 1.1.05",
+    NULL };
+
+// Fills in the plugin definition block supplied by the caller.
+//
+// pb->size tells which layout the caller allocated.  The extended layout additionally
+// receives the list of compatible version strings.  Returns false, leaving the block
+// untouched, when the size matches neither known layout.
+DMETAPHONE_API bool getECLPluginDefinition(ECLPluginDefinitionBlock *pb)
+{
+    const bool isExtended = (pb->size == sizeof(ECLPluginDefinitionBlockEx));
+    if (!isExtended && pb->size != sizeof(ECLPluginDefinitionBlock))
+        return false;
+
+    if (isExtended)
+        ((ECLPluginDefinitionBlockEx *) pb)->compatibleVersions = compatibleVersions;
+
+    pb->magicVersion = PLUGIN_VERSION;
+    pb->version = DMETAPHONE_VERSION;
+    pb->moduleName = "lib_metaphone";
+    pb->ECL = NULL;  // Definition is in lib_metaphone.ecllib
+    pb->flags = PLUGIN_IMPLICIT_MODULE;
+    pb->description = "Metaphone library";
+    return true;
+}
+
+namespace nsDmetaphone {
+
+// Plugin context stored by setPluginContext(); passed to CTXMALLOC when result strings are allocated.
+IPluginContext * parentCtx = NULL;
+
+}
+
+using namespace nsDmetaphone;
+
+// Records the context pointer supplied by the caller for later allocations.
+DMETAPHONE_API void setPluginContext(IPluginContext * _ctx) { parentCtx = _ctx; }
+
+
+// Computes the double metaphone of instr (_len_instr characters, not necessarily
+// NUL-terminated) and returns the primary code in a buffer obtained from CTXMALLOC.
+// __ret_len receives the code length; the buffer also holds a trailing NUL.
+DMETAPHONE_API void DMETAPHONE_CALL mpDMetaphone1(size32_t & __ret_len,char * & __ret_str,unsigned _len_instr,const char * instr)
+{
+    MString word;
+    word.Set(instr, _len_instr);
+
+    cString primaryCode;
+    cString secondaryCode;
+    word.DoubleMetaphone(primaryCode, secondaryCode);
+
+    const char * code = (char*) primaryCode;
+    __ret_len = strlen(code);
+    __ret_str = (char *) CTXMALLOC(parentCtx, __ret_len+1);
+    memcpy(__ret_str, code, __ret_len + 1);     // the +1 copies the terminator as well
+}
+
+// Computes the double metaphone of instr (_len_instr characters, not necessarily
+// NUL-terminated) and returns the secondary code in a buffer obtained from CTXMALLOC.
+// __ret_len receives the code length; the buffer also holds a trailing NUL.
+DMETAPHONE_API void DMETAPHONE_CALL mpDMetaphone2(size32_t & __ret_len,char * & __ret_str,unsigned _len_instr,const char * instr)
+{
+    MString word;
+    word.Set(instr, _len_instr);
+
+    cString primaryCode;
+    cString secondaryCode;
+    word.DoubleMetaphone(primaryCode, secondaryCode);
+
+    const char * code = (char*) secondaryCode;
+    __ret_len = strlen(code);
+    __ret_str = (char *) CTXMALLOC(parentCtx, __ret_len+1);
+    memcpy(__ret_str, code, __ret_len + 1);     // the +1 copies the terminator as well
+}
+
+// Computes the double metaphone of instr (_len_instr characters, not necessarily
+// NUL-terminated) and returns the primary code immediately followed by the secondary
+// code in a buffer obtained from CTXMALLOC.  __ret_len receives the combined length;
+// the buffer also holds a trailing NUL.
+DMETAPHONE_API void DMETAPHONE_CALL mpDMetaphoneBoth(size32_t & __ret_len,char * & __ret_str,unsigned _len_instr,const char * instr)
+{
+    MString word;
+    word.Set(instr, _len_instr);
+
+    cString primaryCode;
+    cString secondaryCode;
+    word.DoubleMetaphone(primaryCode, secondaryCode);
+
+    const char * first = (char*) primaryCode;
+    const char * second = (char*) secondaryCode;
+    const size_t firstLen = strlen(first);
+    const size_t secondLen = strlen(second);
+
+    __ret_len = firstLen + secondLen;
+    __ret_str = (char *) CTXMALLOC(parentCtx, __ret_len+1);
+    memcpy(__ret_str, first, firstLen);
+    memcpy(__ret_str + firstLen, second, secondLen + 1);    // the +1 copies the terminator as well
+}
+
+// Fixed-width variant: writes the primary code into the caller-supplied 20 character
+// buffer, truncated to 20 characters and padded on the right with spaces.  No NUL
+// terminator is written.
+DMETAPHONE_API void DMETAPHONE_CALL mpDMetaphone1_20(char * __ret_str,unsigned _len_instr,const char * instr)
+{
+    MString word;
+    word.Set(instr, _len_instr);
+
+    cString primaryCode;
+    cString secondaryCode;
+    word.DoubleMetaphone(primaryCode, secondaryCode);
+
+    memset(__ret_str, ' ', 20);
+
+    const char * code = (char*) primaryCode;
+    size32_t copyLen = strlen(code);
+    if (copyLen > 20)
+        copyLen = 20;
+    memcpy(__ret_str, code, copyLen);
+}
+
+// Fixed-width variant: writes the secondary code into the caller-supplied 20 character
+// buffer, truncated to 20 characters and padded on the right with spaces.  No NUL
+// terminator is written.
+DMETAPHONE_API void DMETAPHONE_CALL mpDMetaphone2_20(char * __ret_str,unsigned _len_instr,const char * instr)
+{
+    MString word;
+    word.Set(instr, _len_instr);
+
+    cString primaryCode;
+    cString secondaryCode;
+    word.DoubleMetaphone(primaryCode, secondaryCode);
+
+    memset(__ret_str, ' ', 20);
+
+    const char * code = (char*) secondaryCode;
+    size32_t copyLen = strlen(code);
+    if (copyLen > 20)
+        copyLen = 20;
+    memcpy(__ret_str, code, copyLen);
+}
+
+// Fixed-width variant: writes the primary code followed directly by the secondary code
+// into the caller-supplied 40 character buffer, padded on the right with spaces.  No
+// NUL terminator is written.
+//
+// Each code is limited to 20 characters.  Both lengths are clamped BEFORE they are used
+// as offsets or copy sizes, so the secondary code always starts right after the copied
+// part of the primary code and at most 40 bytes are ever written.
+DMETAPHONE_API void DMETAPHONE_CALL mpDMetaphoneBoth_40(char * __ret_str,unsigned _len_instr,const char * instr)
+{
+    cString metaph;
+    cString metaph2;
+    MString ms;
+    ms.Set(instr, _len_instr);
+    ms.DoubleMetaphone(metaph, metaph2);
+    memset(__ret_str, ' ', 40);
+
+    size32_t metaph_len = strlen((char*) metaph);
+    if (metaph_len > 20)
+        metaph_len = 20;
+    memcpy(__ret_str, (char*) metaph, metaph_len);
+
+    size32_t metaph2_len = strlen((char*) metaph2);
+    if (metaph2_len > 20)
+        metaph2_len = 20;
+    // metaph_len <= 20 and metaph2_len <= 20, so this stays inside the 40 byte buffer
+    memcpy(__ret_str+metaph_len, (char*) metaph2, metaph2_len);
+}

+ 33 - 0
plugins/dmetaphone/dmetaphone.hpp

@@ -0,0 +1,33 @@
+#ifndef DMETAPHONE_INCL
+#define DMETAPHONE_INCL
+
+#ifdef _WIN32
+#define DMETAPHONE_CALL _cdecl
+#ifdef DMETAPHONE_EXPORTS
+#define DMETAPHONE_API __declspec(dllexport)
+#else
+#define DMETAPHONE_API __declspec(dllimport)
+#endif
+#else
+#define DMETAPHONE_CALL
+#define DMETAPHONE_API
+#endif
+
+#include "hqlplugins.hpp"
+
+extern "C" {
+
+#ifdef DMETAPHONE_EXPORTS
+// Plugin registration entry points; only declared when building the plugin itself.
+DMETAPHONE_API bool getECLPluginDefinition(ECLPluginDefinitionBlock *pb);
+DMETAPHONE_API void setPluginContext(IPluginContext * _ctx);
+#endif
+
+// Variable-length results: __ret_str receives a newly allocated buffer and __ret_len its length.
+// instr holds _len_instr characters.
+DMETAPHONE_API void DMETAPHONE_CALL mpDMetaphone1(size32_t & __ret_len,char * & __ret_str,unsigned _len_instr,const char * instr);
+DMETAPHONE_API void DMETAPHONE_CALL mpDMetaphone2(size32_t & __ret_len,char * & __ret_str,unsigned _len_instr,const char * instr);
+DMETAPHONE_API void DMETAPHONE_CALL mpDMetaphoneBoth(size32_t & __ret_len,char * & __ret_str,unsigned _len_instr,const char * instr);
+// Fixed-length results: __ret_str is a caller-supplied buffer of 20 (or 40) characters that is space padded.
+DMETAPHONE_API void DMETAPHONE_CALL mpDMetaphone1_20(char * __ret_str,unsigned _len_instr,const char * instr);
+DMETAPHONE_API void DMETAPHONE_CALL mpDMetaphone2_20(char * __ret_str,unsigned _len_instr,const char * instr);
+DMETAPHONE_API void DMETAPHONE_CALL mpDMetaphoneBoth_40(char * __ret_str,unsigned _len_instr,const char * instr);
+}
+
+#endif

+ 895 - 0
plugins/dmetaphone/metaphone.cpp

@@ -0,0 +1,895 @@
+#include "platform.h"
+#include <time.h>
+#include <stdlib.h>
+#include <string.h>
+#include "metaphone.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Double Metaphone (c) 1998, 1999 by Lawrence Philips
+//
+// Slightly modified by Kevin Atkinson to fix several bugs and
+// to allow it to give back more than 4 characters.
+//
+//  13-Dec-00   mtw Modified to return a number (e.g. 77th returns 77)
+//
+// Placed in the public domain by Lawrence Philips
+//
+////////////////////////////////////////////////////////////////////////////////
+#include "metaphone.h"
+#include <ctype.h>
+
+#define AND &&
+#define OR ||
+
+namespace nsDmetaphone {
+
+////////////////////////////////////////////////////////////////////////////////
+// Default constructor: the body performs no explicit initialisation.
+////////////////////////////////////////////////////////////////////////////////
+MString::MString()
+{
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Constructs from a C string by forwarding it to the cString constructor.
+////////////////////////////////////////////////////////////////////////////////
+MString::MString(const char* in) : cString(in)
+{
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Constructs from an existing cString by forwarding it to the cString copy constructor.
+////////////////////////////////////////////////////////////////////////////////
+MString::MString(const cString& in) : cString(in)
+{
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Returns true when Find() reports any of 'W', 'K', "CZ" or "WITZ" for this string.
+// NOTE(review): Find() is declared outside this file section; presumably a substring
+// search over the whole string - confirm.
+////////////////////////////////////////////////////////////////////////////////
+bool MString::SlavoGermanic()
+{
+    return (Find('W') OR Find('K') OR Find("CZ") OR Find("WITZ"));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Appends the same code string to both the primary and the secondary encoding.
+////////////////////////////////////////////////////////////////////////////////
+inline void MString::MetaphAdd(const char* main)
+{
+    primary.Cat(main);
+    secondary.Cat(main);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Appends the same single character to both the primary and the secondary encoding.
+////////////////////////////////////////////////////////////////////////////////
+inline void MString::MetaphAdd(const char main)
+{
+    primary += main;
+    secondary += main;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Appends main to the primary encoding and alt to the secondary encoding.
+//
+// - A non-empty main is always appended to primary.
+// - A non-empty alt sets the alternate flag; it is appended to secondary unless it
+//   starts with a space (a leading space means "add nothing to secondary").
+// - An empty alt makes secondary receive main instead, again unless main starts
+//   with a space.
+////////////////////////////////////////////////////////////////////////////////
+inline void MString::MetaphAdd(const char* main, const char* alt)
+{
+    const bool hasMain = (main[0] != '\0');
+    if (hasMain)
+        primary.Cat(main);
+
+    if (alt[0] == '\0')
+    {
+        // no distinct alternative: secondary mirrors primary
+        if (hasMain && (main[0] != ' '))
+            secondary.Cat(main);
+        return;
+    }
+
+    alternate = true;
+    if (alt[0] != ' ')
+        secondary.Cat(alt);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Returns true when the character at index 'at' is one of A, E, I, O, U or Y.
+// Indexes outside [0, length) are never vowels.
+////////////////////////////////////////////////////////////////////////////////
+bool MString::IsVowel(int at)
+{
+    if ((at < 0) || (at >= length))
+        return false;
+
+    const char ch = GetAt(at);
+    switch (ch)
+    {
+    case 'A':
+    case 'E':
+    case 'I':
+    case 'O':
+    case 'U':
+    case 'Y':
+        return true;
+    default:
+        return false;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests whether the len characters starting at index start equal any of the
+// candidate strings passed as variable arguments.
+//
+// start   zero based index into this string; a negative index never matches
+// len     number of characters to compare; must be smaller than the 64 byte
+//         scratch buffer, otherwise nothing matches
+// ...     candidate strings (const char *); the list MUST end with an empty
+//         string "", which acts as the terminator
+//
+// Returns true on the first candidate equal to the extracted substring, false
+// when the substring would run past the end of the string or nothing matches.
+////////////////////////////////////////////////////////////////////////////////
+bool MString::StringAt(int start, int len, ... )
+{
+    char    target[64];
+
+    // reject requests that cannot be satisfied or would overflow target
+    if ((start < 0) || (len < 0) || (len >= (int) sizeof(target)))
+        return false;
+
+    if (Len - start < len)
+    {
+        return false;
+    }
+    memcpy( target, Ptr + start, len );
+    target[len] = 0;
+
+    bool found = false;
+
+    va_list sstrings;
+    va_start(sstrings, len);
+
+    for (;;)
+    {
+        // callers pass string literals, so read them back as const char *
+        const char* test = va_arg(sstrings, const char*);
+        if (*test == '\0')
+            break;              // empty string terminates the candidate list
+        if (strcmp(target, test) == 0)
+        {
+            found = true;
+            break;
+        }
+    }
+
+    // every va_start must be paired with va_end, including on the match path
+    va_end(sstrings);
+
+    return found;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// main deal
+////////////////////////////////////////////////////////////////////////////////
+void MString::DoubleMetaphone(cString &metaph, cString &metaph2)
+{
+
+    int current = 0;
+
+    length = Len;
+    if(length < 1)
+        return;
+    last = length - 1;//zero based index
+
+    alternate = false;
+    primary = "";
+    secondary = "";
+
+    Upper();
+
+    //pad the original string so that we can index beyond the edge of the world
+    Cat("     ");
+
+    //skip these when at start of word
+    if(StringAt(0, 2, "GN", "KN", "PN", "WR", "PS", ""))
+        current += 1;
+
+    //Initial 'X' is pronounced 'Z' e.g. 'Xavier'
+    if(GetAt(0) == 'X')
+    {
+        MetaphAdd('S'); //'Z' maps to 'S'
+        current += 1;
+    }
+
+    if (isdigit(GetAt(0)))
+    {
+        while (isdigit(GetAt(current)) && current < length)
+        {
+            MetaphAdd(GetAt(current));
+            current++;
+        }
+    }
+    else while(true OR (primary.Len < 4) OR (secondary.Len < 4))
+        ///////////main loop//////////////////////////
+    {
+        if(current >= length)
+            break;
+
+        switch(GetAt(current))
+        {
+        case 'A':
+        case 'E':
+        case 'I':
+        case 'O':
+        case 'U':
+        case 'Y':
+            if(current == 0)
+                //all init vowels now map to 'A'
+                MetaphAdd('A');
+            current +=1;
+            break;
+
+        case 'B':
+
+            //"-mb", e.g", "dumb", already skipped over...
+            MetaphAdd('P');
+
+            if(GetAt(current + 1) == 'B')
+                current +=2;
+            else
+                current +=1;
+            break;
+
+        case '\307': // ascii 0xc7 = C with cedilla
+            MetaphAdd('S');
+            current += 1;
+            break;
+
+        case 'C':
+            //various germanic
+            if((current > 1)
+                AND !IsVowel(current - 2)
+                AND StringAt((current - 1), 3, "ACH", "")
+                AND ((GetAt(current + 2) != 'I') AND ((GetAt(current + 2) != 'E')
+                OR StringAt((current - 2), 6, "BACHER", "MACHER", "")) ))
+            {
+                MetaphAdd('K');
+                current +=2;
+                break;
+            }
+
+            //special case 'caesar'
+            if((current == 0) AND StringAt(current, 6, "CAESAR", ""))
+            {
+                MetaphAdd('S');
+                current +=2;
+                break;
+            }
+
+            //italian 'chianti'
+            if(StringAt(current, 4, "CHIA", ""))
+            {
+                MetaphAdd('K');
+                current +=2;
+                break;
+            }
+
+            if(StringAt(current, 2, "CH", ""))
+            {
+                //find 'michael'
+                if((current > 0) AND StringAt(current, 4, "CHAE", ""))
+                {
+                    MetaphAdd("K", "X");
+                    current +=2;
+                    break;
+                }
+
+                //greek roots e.g. 'chemistry', 'chorus'
+                if((current == 0)
+                    AND (StringAt((current + 1), 5, "HARAC", "HARIS", "")
+                    OR StringAt((current + 1), 3, "HOR", "HYM", "HIA", "HEM", ""))
+                    AND !StringAt(0, 5, "CHORE", ""))
+                {
+                    MetaphAdd('K');
+                    current +=2;
+                    break;
+                }
+
+                //germanic, greek, or otherwise 'ch' for 'kh' sound
+                if((StringAt(0, 4, "VAN ", "VON ", "") OR StringAt(0, 3, "SCH", ""))
+                    // 'architect but not 'arch', 'orchestra', 'orchid'
+                    OR StringAt((current - 2), 6, "ORCHES", "ARCHIT", "ORCHID", "")
+                    OR StringAt((current + 2), 1, "T", "S", "")
+                    OR ((StringAt((current - 1), 1, "A", "O", "U", "E", "") OR (current == 0))
+                    //e.g., 'wachtler', 'wechsler', but not 'tichner'
+                    AND StringAt((current + 2), 1, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ", "")))
+                {
+                    MetaphAdd('K');
+                }else{
+                    if(current > 0)
+                    {
+                        if(StringAt(0, 2, "MC", ""))
+                            //e.g., "McHugh"
+                            MetaphAdd('K');
+                        else
+                            MetaphAdd("X", "K");
+                    }else
+                        MetaphAdd('X');
+                }
+                current +=2;
+                break;
+            }
+            //e.g, 'czerny'
+            if(StringAt(current, 2, "CZ", "") AND !StringAt((current - 2), 4, "WICZ", ""))
+            {
+                MetaphAdd("S", "X");
+                current += 2;
+                break;
+            }
+
+            //e.g., 'focaccia'
+            if(StringAt((current + 1), 3, "CIA", ""))
+            {
+                MetaphAdd('X');
+                current += 3;
+                break;
+            }
+
+            //double 'C', but not if e.g. 'McClellan'
+            if(StringAt(current, 2, "CC", "") AND !((current == 1) AND (GetAt(0) == 'M')))
+            {
+                //'bellocchio' but not 'bacchus'
+                if(StringAt((current + 2), 1, "I", "E", "H", "") AND !StringAt((current + 2), 2, "HU", ""))
+                {
+                    //'accident', 'accede' 'succeed'
+                    if(((current == 1) AND (GetAt(current - 1) == 'A'))
+                        OR StringAt((current - 1), 5, "UCCEE", "UCCES", ""))
+                        MetaphAdd("KS");
+                    //'bacci', 'bertucci', other italian
+                    else
+                        MetaphAdd('X');
+                    current += 3;
+                    break;
+                }else{//Pierce's rule
+                    MetaphAdd('K');
+                    current += 2;
+                    break;
+                }
+            }
+
+            if(StringAt(current, 2, "CK", "CG", "CQ", ""))
+            {
+                MetaphAdd('K');
+                current += 2;
+                break;
+            }
+
+            if(StringAt(current, 2, "CI", "CE", "CY", ""))
+            {
+                //italian vs. english
+                if(StringAt(current, 3, "CIO", "CIE", "CIA", ""))
+                    MetaphAdd("S", "X");
+                else
+                    MetaphAdd('S');
+                current += 2;
+                break;
+            }
+
+            //else
+            MetaphAdd('K');
+
+            //name sent in 'mac caffrey', 'mac gregor
+            if(StringAt((current + 1), 2, " C", " Q", " G", ""))
+                current += 3;
+            else
+                if(StringAt((current + 1), 1, "C", "K", "Q", "")
+                    AND !StringAt((current + 1), 2, "CE", "CI", ""))
+                    current += 2;
+                else
+                    current += 1;
+            break;
+
+        case 'D':
+            if(StringAt(current, 2, "DG", ""))
+            {
+                if(StringAt((current + 2), 1, "I", "E", "Y", ""))
+                {
+                    //e.g. 'edge'
+                    MetaphAdd('J');
+                    current += 3;
+                    break;
+                }else{
+                    //e.g. 'edgar'
+                    MetaphAdd("TK");
+                    current += 2;
+                    break;
+                }
+            }
+
+            if(StringAt(current, 2, "DT", "DD", ""))
+            {
+                MetaphAdd('T');
+                current += 2;
+                break;
+            }
+
+            //else
+            MetaphAdd('T');
+            current += 1;
+            break;
+
+       case 'F':
+            if(GetAt(current + 1) == 'F')
+                current += 2;
+            else
+                current += 1;
+            MetaphAdd('F');
+            break;
+
+        case 'G':
+            if(GetAt(current + 1) == 'H')
+            {
+                if((current > 0) AND !IsVowel(current - 1))
+                {
+                    MetaphAdd('K');
+                    current += 2;
+                    break;
+                }
+
+                if(current < 3)
+                {
+                    //'ghislane', ghiradelli
+                    if(current == 0)
+                    {
+                        if(GetAt(current + 2) == 'I')
+                            MetaphAdd('J');
+                        else
+                            MetaphAdd('K');
+                        current += 2;
+                        break;
+                    }
+                }
+                //Parker's rule (with some further refinements) - e.g., 'hugh'
+                if(((current > 1) AND StringAt((current - 2), 1, "B", "H", "D", "") )
+                    //e.g., 'bough'
+                    OR ((current > 2) AND StringAt((current - 3), 1, "B", "H", "D", "") )
+                    //e.g., 'broughton'
+                    OR ((current > 3) AND StringAt((current - 4), 1, "B", "H", "") ) )
+                {
+                    current += 2;
+                    break;
+                }else{
+                    //e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
+                    if((current > 2)
+                        AND (GetAt(current - 1) == 'U')
+                        AND StringAt((current - 3), 1, "C", "G", "L", "R", "T", "") )
+                    {
+                        MetaphAdd('F');
+                    }else
+                        if((current > 0) AND GetAt(current - 1) != 'I')
+                            MetaphAdd('K');
+
+                        current += 2;
+                        break;
+                }
+            }
+
+            if(GetAt(current + 1) == 'N')
+            {
+                if((current == 1) AND IsVowel(0) AND !SlavoGermanic())
+                {
+                    MetaphAdd("KN", "N");
+                }else
+                    //not e.g. 'cagney'
+                    if(!StringAt((current + 2), 2, "EY", "")
+                        AND (GetAt(current + 1) != 'Y') AND !SlavoGermanic())
+                    {
+                        MetaphAdd("N", "KN");
+                    }else
+                        MetaphAdd("KN");
+                    current += 2;
+                    break;
+            }
+
+            //'tagliaro'
+            if(StringAt((current + 1), 2, "LI", "") AND !SlavoGermanic())
+            {
+                MetaphAdd("KL", "L");
+                current += 2;
+                break;
+            }
+
+            //-ges-,-gep-,-gel-, -gie- at beginning
+            if((current == 0)
+                AND ((GetAt(current + 1) == 'Y')
+                OR StringAt((current + 1), 2, "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER", "")) )
+            {
+                MetaphAdd("K", "J");
+                current += 2;
+                break;
+            }
+
+            // -ger-,  -gy-
+            if((StringAt((current + 1), 2, "ER", "") OR (GetAt(current + 1) == 'Y'))
+                AND !StringAt(0, 6, "DANGER", "RANGER", "MANGER", "")
+                AND !StringAt((current - 1), 1, "E", "I", "")
+                AND !StringAt((current - 1), 3, "RGY", "OGY", "") )
+            {
+                MetaphAdd("K", "J");
+                current += 2;
+                break;
+            }
+
+            // italian e.g, 'biaggi'
+            if(StringAt((current + 1), 1, "E", "I", "Y", "") OR StringAt((current - 1), 4, "AGGI", "OGGI", ""))
+            {
+                //obvious germanic
+                if((StringAt(0, 4, "VAN ", "VON ", "") OR StringAt(0, 3, "SCH", ""))
+                    OR StringAt((current + 1), 2, "ET", ""))
+                    MetaphAdd('K');
+                else
+                    //always soft if french ending
+                    if(StringAt((current + 1), 4, "IER ", ""))
+                        MetaphAdd('J');
+                    else
+                        MetaphAdd("J", "K");
+                    current += 2;
+                    break;
+            }
+
+            if(GetAt(current + 1) == 'G')
+                current += 2;
+            else
+                current += 1;
+            MetaphAdd('K');
+            break;
+
+        case 'H':
+            //only keep if first & before vowel or btw. 2 vowels
+            if(((current == 0) OR IsVowel(current - 1))
+                AND IsVowel(current + 1))
+            {
+                MetaphAdd('H');
+                current += 2;
+            }else//also takes care of 'HH'
+                current += 1;
+            break;
+
+        case 'J':
+            //obvious spanish, 'jose', 'san jacinto'
+            if(StringAt(current, 4, "JOSE", "") OR StringAt(0, 4, "SAN ", "") )
+            {
+                if(((current == 0) AND (GetAt(current + 4) == ' ')) OR StringAt(0, 4, "SAN ", "") )
+                    MetaphAdd('H');
+                else
+                {
+                    MetaphAdd("J", "H");
+                }
+                current +=1;
+                break;
+            }
+
+            if((current == 0) AND !StringAt(current, 4, "JOSE", ""))
+                MetaphAdd("J", "A");//Yankelovich/Jankelowicz
+            else
+                //spanish pron. of e.g. 'bajador'
+                if(IsVowel(current - 1)
+                    AND !SlavoGermanic()
+                    AND ((GetAt(current + 1) == 'A') OR (GetAt(current + 1) == 'O')))
+                    MetaphAdd("J", "H");
+                else
+                    if(current == last)
+                        MetaphAdd("J", " ");
+                    else
+                        if(!StringAt((current + 1), 1, "L", "T", "K", "S", "N", "M", "B", "Z", "")
+                            AND !StringAt((current - 1), 1, "S", "K", "L", ""))
+                            MetaphAdd('J');
+
+                        if(GetAt(current + 1) == 'J')//it could happen!
+                            current += 2;
+                        else
+                            current += 1;
+                        break;
+
+        case 'K':
+            if(GetAt(current + 1) == 'K')
+                current += 2;
+            else
+                current += 1;
+            MetaphAdd('K');
+            break;
+
+        case 'L':
+            if(GetAt(current + 1) == 'L')
+            {
+                //spanish e.g. 'cabrillo', 'gallegos'
+                if(((current == (length - 3))
+                    AND StringAt((current - 1), 4, "ILLO", "ILLA", "ALLE", ""))
+                    OR ((StringAt((last - 1), 2, "AS", "OS", "") OR StringAt(last, 1, "A", "O", ""))
+                    AND StringAt((current - 1), 4, "ALLE", "")) )
+                {
+                    MetaphAdd("L", " ");
+                    current += 2;
+                    break;
+                }
+                current += 2;
+            }else
+                current += 1;
+            MetaphAdd('L');
+            break;
+
+        case 'M':
+            if((StringAt((current - 1), 3, "UMB", "")
+                AND (((current + 1) == last) OR StringAt((current + 2), 2, "ER", "")))
+                //'dumb','thumb'
+                OR  (GetAt(current + 1) == 'M') )
+                current += 2;
+            else
+                current += 1;
+            MetaphAdd('M');
+            break;
+
+        case 'N':
+            if(GetAt(current + 1) == 'N')
+                current += 2;
+            else
+                current += 1;
+            MetaphAdd('N');
+            break;
+
+        case '\321': // Ascii 0xD1 = capital N with tilde
+            current += 1;
+            MetaphAdd('N');
+            break;
+
+        case 'P':
+            if(GetAt(current + 1) == 'H')
+            {
+                MetaphAdd('F');
+                current += 2;
+                break;
+            }
+
+            //also account for "campbell", "raspberry"
+            if(StringAt((current + 1), 1, "P", "B", ""))
+                current += 2;
+            else
+                current += 1;
+            MetaphAdd('P');
+            break;
+
+        case 'Q':
+            if(GetAt(current + 1) == 'Q')
+                current += 2;
+            else
+                current += 1;
+            MetaphAdd('K');
+            break;
+
+        case 'R':
+            //french e.g. 'rogier', but exclude 'hochmeier'
+            if((current == last)
+                AND !SlavoGermanic()
+                AND StringAt((current - 2), 2, "IE", "")
+                AND !StringAt((current - 4), 2, "ME", "MA", ""))
+                MetaphAdd("", "R");
+            else
+                MetaphAdd('R');
+
+            if(GetAt(current + 1) == 'R')
+                current += 2;
+            else
+                current += 1;
+            break;
+
+        case 'S':
+            //special cases 'island', 'isle', 'carlisle', 'carlysle'
+            if(StringAt((current - 1), 3, "ISL", "YSL", ""))
+            {
+                current += 1;
+                break;
+            }
+
+            //special case 'sugar-'
+            if((current == 0) AND StringAt(current, 5, "SUGAR", ""))
+            {
+                MetaphAdd("X", "S");
+                current += 1;
+                break;
+            }
+
+            if(StringAt(current, 2, "SH", ""))
+            {
+                //germanic
+                if(StringAt((current + 1), 4, "HEIM", "HOEK", "HOLM", "HOLZ", ""))
+                    MetaphAdd('S');
+                else
+                    MetaphAdd('X');
+                current += 2;
+                break;
+            }
+
+            //italian & armenian
+            if(StringAt(current, 3, "SIO", "SIA", "") OR StringAt(current, 4, "SIAN", ""))
+            {
+                if(!SlavoGermanic())
+                    MetaphAdd("S", "X");
+                else
+                    MetaphAdd('S');
+                current += 3;
+                break;
+            }
+
+            //german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
+            //also, -sz- in slavic language altho in hungarian it is pronounced 's'
+            if(((current == 0)
+                AND StringAt((current + 1), 1, "M", "N", "L", "W", ""))
+                OR StringAt((current + 1), 1, "Z", ""))
+            {
+                MetaphAdd("S", "X");
+                if(StringAt((current + 1), 1, "Z", ""))
+                    current += 2;
+                else
+                    current += 1;
+                break;
+            }
+
+            if(StringAt(current, 2, "SC", ""))
+            {
+                //Schlesinger's rule
+                if(GetAt(current + 2) == 'H')
+                {
+                    //dutch origin, e.g. 'school', 'schooner'
+                    if(StringAt((current + 3), 2, "OO", "ER", "EN", "UY", "ED", "EM", ""))
+                    {
+                        //'schermerhorn', 'schenker'
+                        if(StringAt((current + 3), 2, "ER", "EN", ""))
+                        {
+                            MetaphAdd("X", "SK");
+                        }else
+                            MetaphAdd("SK");
+                        current += 3;
+                        break;
+                    }else{
+                        if((current == 0) AND !IsVowel(3) AND (GetAt(3) != 'W'))
+                            MetaphAdd("X", "S");
+                        else
+                            MetaphAdd('X');
+                        current += 3;
+                        break;
+                    }
+                }
+
+                if(StringAt((current + 2), 1, "I", "E", "Y", ""))
+                {
+                    MetaphAdd('S');
+                    current += 3;
+                    break;
+                }
+                //else
+                MetaphAdd("SK");
+                current += 3;
+                break;
+            }
+
+            //french e.g. 'resnais', 'artois'
+            if((current == last) AND StringAt((current - 2), 2, "AI", "OI", ""))
+                MetaphAdd("", "S");
+            else
+                MetaphAdd('S');
+
+            if(StringAt((current + 1), 1, "S", "Z", ""))
+                current += 2;
+            else
+                current += 1;
+            break;
+
+        case 'T':
+            if(StringAt(current, 4, "TION", ""))
+            {
+                MetaphAdd('X');
+                current += 3;
+                break;
+            }
+
+            if(StringAt(current, 3, "TIA", "TCH", ""))
+            {
+                MetaphAdd('X');
+                current += 3;
+                break;
+            }
+
+            if(StringAt(current, 2, "TH", "")
+                OR StringAt(current, 3, "TTH", ""))
+            {
+                //special case 'thomas', 'thames' or germanic
+                if(StringAt((current + 2), 2, "OM", "AM", "")
+                    OR StringAt(0, 4, "VAN ", "VON ", "")
+                    OR StringAt(0, 3, "SCH", ""))
+                {
+                    MetaphAdd('T');
+                }else{
+                    MetaphAdd("0", "T");
+                }
+                current += 2;
+                break;
+            }
+
+            if(StringAt((current + 1), 1, "T", "D", ""))
+                current += 2;
+            else
+                current += 1;
+            MetaphAdd('T');
+            break;
+
+        case 'V':
+            if(GetAt(current + 1) == 'V')
+                current += 2;
+            else
+                current += 1;
+            MetaphAdd('F');
+            break;
+
+        case 'W':
+            //can also be in middle of word
+            if(StringAt(current, 2, "WR", ""))
+            {
+                MetaphAdd('R');
+                current += 2;
+                break;
+            }
+
+            if((current == 0)
+                AND (IsVowel(current + 1) OR StringAt(current, 2, "WH", "")))
+            {
+                //Wasserman should match Vasserman
+                if(IsVowel(current + 1))
+                    MetaphAdd("A", "F");
+                else
+                    //need Uomo to match Womo
+                    MetaphAdd('A');
+            }
+
+            //Arnow should match Arnoff
+            if(((current == last) AND IsVowel(current - 1))
+                OR StringAt((current - 1), 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY", "")
+                OR StringAt(0, 3, "SCH", ""))
+            {
+                MetaphAdd("", "F");
+                current +=1;
+                break;
+            }
+
+            //polish e.g. 'filipowicz'
+            if(StringAt(current, 4, "WICZ", "WITZ", ""))
+            {
+                MetaphAdd("TS", "FX");
+                current +=4;
+                break;
+            }
+
+            //else skip it
+            current +=1;
+            break;
+
+        case 'X':
+            //french e.g. breaux
+            if(!((current == last)
+                AND (StringAt((current - 3), 3, "IAU", "EAU", "")
+                OR StringAt((current - 2), 2, "AU", "OU", ""))) )
+                MetaphAdd("KS");
+
+            if(StringAt((current + 1), 1, "C", "X", ""))
+                current += 2;
+            else
+                current += 1;
+            break;
+
+        case 'Z':
+            //chinese pinyin e.g. 'zhao'
+            if(GetAt(current + 1) == 'H')
+            {
+                MetaphAdd('J');
+                current += 2;
+                break;
+            }else
+                if(StringAt((current + 1), 2, "ZO", "ZI", "ZA", "")
+                    OR (SlavoGermanic() AND ((current > 0) AND GetAt(current - 1) != 'T')))
+                {
+                    MetaphAdd("S", "TS");
+                }
+                else
+                    MetaphAdd('S');
+
+                if(GetAt(current + 1) == 'Z')
+                    current += 2;
+                else
+                    current += 1;
+                break;
+
+        default:
+            current += 1;
+        }
+    }
+
+    metaph = primary.Ptr;
+    //only give back 4 char metaph
+    //if(metaph.Len > 4)
+    //        metaph.SetAt(4,'\0');
+    metaph2 = secondary.Ptr;
+    //if(metaph2.Len > 4)
+    //        metaph2.SetAt(4,'\0');
+
+}
+
+}//namespace

+ 69 - 0
plugins/dmetaphone/metaphone.h

@@ -0,0 +1,69 @@
+#ifndef _METAPHONE_H
+#define _METAPHONE_H
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "platform.h"
+#include "dmetaphone.hpp"
+#include "cstring.h"
+#include <string.h>
+
+//#include <varargs.h>
+//#define false FALSE
+//#define true TRUE
+
+namespace nsDmetaphone {
+
+#ifdef _MSC_VER
+//Disable warnings about cString not matching the export specification of MString,
+//because all methods of cString are inline, so they will be compiled into each
+//plugin.
+#pragma warning(push)
+#pragma warning(disable: 4251 4275)
+#endif
+
+// Double Metaphone encoder: a cString holding the word to be encoded together
+// with the state used while building its primary and secondary keys.
+// Members without a body here are defined out of line.
+class DMETAPHONE_API MString : public cString
+{
+private:
+        // NOTE(review): presumably the character count of the word and the
+        // index of its final character; both are initialised out of line,
+        // confirm in the constructors.
+        int             length;
+        int             last;
+        // NOTE(review): not referenced by any inline code in this header;
+        // presumably tracks whether an alternate encoding was produced - confirm.
+        bool            alternate;
+        // Keys accumulated during encoding; DoubleMetaphone() copies them out
+        // to its two output arguments.
+        cString         primary;
+        cString         secondary;
+
+public:
+        // Constructors (defined out of line).
+        MString();
+        MString(const char*);
+        MString(const cString&);
+
+        // Replaces the held text with Str by forwarding to Set().
+        // NOTE(review): this inline body updates nothing else - check whether
+        // length/last need refreshing after an assignment.
+        MString& operator =(const char *Str)
+        {
+            Set(Str);
+            return *this;
+        }
+
+        // Classification helpers used by the encoder (defined out of line).
+        bool SlavoGermanic();
+        bool IsVowel(int at);
+
+        // Append code letters to the key(s) being built (defined out of line).
+        inline void MetaphAdd(const char* main);
+        inline void MetaphAdd(const char main);
+        inline void MetaphAdd(const char* main, const char* alt);
+
+        // Variadic substring test. Callers follow 'length' with a list of
+        // candidate strings and terminate that list with an empty string "".
+        bool StringAt(int start, int length, ... );
+
+        // Runs the encoding; the two keys are returned through metaph/metaph2.
+        void DoubleMetaphone(cString &metaph, cString &metaph2);
+
+        // Raw character access into Ptr: no bounds checking is done on x.
+        char GetAt( int x )
+        {
+            return *(Ptr + x);
+        }
+
+        // True when strchr() locates character c in Ptr.
+        bool Find(char c)
+        {
+            const char * hit = strchr(Ptr, c);
+            return hit != NULL;
+        }
+
+        // True when strstr() locates the substring str in Ptr.
+        bool Find(const char * str)
+        {
+            const char * hit = strstr(Ptr, str);
+            return hit != NULL;
+        }
+};
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+}//namespace
+
+#endif

+ 6 - 0
testing/regress/ecl/key/metaphone.xml

@@ -0,0 +1,6 @@
+<Dataset name='Result 1'>
+ <Row><name>Algernon</name><d1>ALKRNN</d1><d2>ALJRNN</d2><db>ALKRNNALJRNN</db><d1_20>ALKRNN              </d1_20><d2_20>ALJRNN              </d2_20><db_40>ALKRNNALJRNN                            </db_40></Row>
+ <Row><name>Englebert</name><d1>ANKLPRT</d1><d2>ANKLPRT</d2><db>ANKLPRTANKLPRT</db><d1_20>ANKLPRT             </d1_20><d2_20>ANKLPRT             </d2_20><db_40>ANKLPRTANKLPRT                          </db_40></Row>
+ <Row><name>Cholmondley</name><d1>XLMNTL</d1><d2>XLMNTL</d2><db>XLMNTLXLMNTL</db><d1_20>XLMNTL              </d1_20><d2_20>XLMNTL              </d2_20><db_40>XLMNTLXLMNTL                            </db_40></Row>
+ <Row><name>Farquar</name><d1>FRKR</d1><d2>FRKR</d2><db>FRKRFRKR</db><d1_20>FRKR                </d1_20><d2_20>FRKR                </d2_20><db_40>FRKRFRKR                                </db_40></Row>
+</Dataset>

+ 47 - 0
testing/regress/ecl/metaphone.ecl

@@ -0,0 +1,47 @@
+/*##############################################################################
+
+    HPCC SYSTEMS software Copyright (C) 2015 HPCC Systems.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+############################################################################## */
+
+import Std.Metaphone, lib_metaphone;
+
+// Names pushed through every metaphone entry point. The rows appear in the
+// expected key file in this same order, so keep the order stable.
+input := DATASET([{'Algernon'}, {'Englebert'}, {'Cholmondley'}, {'Farquar'}],
+                 {STRING name});
+
+// Layout of one result row: the source name followed by each encoding of it.
+outrec := {
+  STRING   name;    // source name, copied through unchanged
+  STRING   d1;      // variable-length primary key
+  STRING   d2;      // variable-length secondary key
+  STRING   db;      // variable-length combined key
+  STRING20 d1_20;   // primary key in a 20-character field
+  STRING20 d2_20;   // secondary key in a 20-character field
+  STRING40 db_40;   // combined key in a 40-character field
+};
+
+// Builds one outrec row for a single name by calling each metaphone function.
+outrec t(STRING name) := TRANSFORM
+   SELF.name  := name;
+
+   // Variable-length results, obtained through the Std.Metaphone module.
+   SELF.d1    := Metaphone.primary(name);
+   SELF.d2    := Metaphone.secondary(name);
+   SELF.db    := Metaphone.double(name);
+
+   // Fixed-width results, obtained from lib_metaphone.MetaphoneLib directly.
+   SELF.d1_20 := lib_metaphone.MetaphoneLib.DMetaphone1_20(name);
+   SELF.d2_20 := lib_metaphone.MetaphoneLib.DMetaphone2_20(name);
+   SELF.db_40 := lib_metaphone.MetaphoneLib.DMetaphoneBoth_40(name);
+END;
+
+// Encode every input name and emit the resulting rows.
+encoded := PROJECT(input, t(LEFT.name));
+output(encoded);