Просмотр исходного кода

HPCC-17914 Optimize the performance for WildMatch

Signed-off-by: Gavin Halliday <gavin.halliday@lexisnexis.com>
Gavin Halliday 8 лет назад
Родитель
Коммит
ff49eee069
3 изменённых файла: 299 добавлено и 57 удалено
  1. 136 56
      system/jlib/jregexp.cpp
  2. 1 1
      system/jlib/jregexp.hpp
  3. 162 0
      testing/unittests/jlibtests.cpp

+ 136 - 56
system/jlib/jregexp.cpp

@@ -1225,79 +1225,159 @@ static char *SoundexCode(const char *s,int l,char *res)
   return res;
 }
 
-static bool WildMatchN ( const char *src, int srclen, int srcidx,
-                    const char *pat, int patlen, int patidx,int nocase)
+//---------------------------------------------------------------------------------------------------------------------
+
+/*
+ * Compare a character from the search string with a literal pattern character.
+ *
+ * cur    - character taken from the search string
+ * next   - literal (non wildcard) character taken from the pattern
+ * nocase - if true both characters are upper-cased before being compared
+ *
+ * Returns true if the two characters are considered equal.
+ */
+inline bool matches(char cur, char next, bool nocase)
 {
-  char next_char;
-  for (;;) {
-    if (patidx == patlen)
-       return (srcidx == srclen);
-    next_char = pat[patidx++];
-    if (next_char == '?') {
-      if (srcidx == srclen)
-        return false;
-      srcidx++;
-    }
-    else if (next_char != '*') {
-      if (nocase) {
-        if ((srcidx == srclen) ||
-           (toupper(src[srcidx])!=toupper(next_char)))
-          return false;
-      }
-      else
-        if ((srcidx == srclen) || (src[srcidx]!=next_char))
-          return false;
-      srcidx++;
+    if (!nocase)
+        return cur == next;
+    //toupper() is only defined for EOF and values representable as unsigned char, so a plain
+    //(possibly negative) char must be converted before the call.
+    return toupper((unsigned char)cur) == toupper((unsigned char)next);
+}
+
+/*
+ * Search for a pattern pat anywhere within the search string src (the match is not anchored at either end).
+ *
+ * src, srclen - the text to search and the number of characters in it
+ * pat, patlen - the pattern and its length; '?' matches any single character, '*' matches any run of
+ *               characters.  The pattern must start with a non '*' character and must not end with '*'.
+ * nocase      - if true literal characters are compared ignoring case
+ *
+ * Returns true if some substring of src matches the whole of pat.
+ */
+static bool WildSubStringMatch(const char *src, size_t srclen, const char *pat, size_t patlen, bool nocase)
+{
+    //On entry the pattern to match contains at least one leading non '*' character
+    //Upper-case the first pattern character once up front.  toupper() requires a value representable as
+    //unsigned char, so convert before the call and narrow the result back for the comparisons below.
+    char pat0 = pat[0];
+    if (nocase)
+        pat0 = (char)toupper((unsigned char)pat[0]);
+
+    //Could special case '?' at the start of the string, but fairly unlikely.
+    for (size_t srcdelta=0; srcdelta < srclen; srcdelta++)
+    {
+        size_t patidx=0;
+        size_t srcidx = srcdelta;
+        if (likely(pat0 != '?'))
+        {
+            //Quick scan to find a match for the first character
+            if (!nocase)
+            {
+                for (;;)
+                {
+                    if (unlikely(src[srcdelta] == pat0))
+                        break;
+                    srcdelta++;
+                    if (unlikely(srcdelta == srclen))
+                        return false;
+                }
+            }
+            else
+            {
+                for (;;)
+                {
+                    if (unlikely((char)toupper((unsigned char)src[srcdelta]) == pat0))
+                        break;
+                    srcdelta++;
+                    if (unlikely(srcdelta == srclen))
+                        return false;
+                }
+            }
+            patidx=1;
+            srcidx = srcdelta+1;
+        }
+        for (;;)
+        {
+            if (patidx == patlen)
+                return true;
+
+            char next = pat[patidx];
+            if (next == '*')
+            {
+                do
+                {
+                    patidx++;
+                }
+                while ((patidx < patlen) && (pat[patidx] == '*'));
+                dbgassertex((patidx != patlen)); // pattern should never finish with a '*'
+                //The rest of the pattern may match anywhere in the remaining text.  This is the earliest
+                //position at which the leading section matched, so any later match of that section would
+                //only search a suffix of the same text: if the rest is not found here it can never be found,
+                //and retrying at later positions would just cause exponential backtracking.
+                return WildSubStringMatch(src+srcidx, srclen-srcidx, pat+patidx, patlen-patidx, nocase);
+            }
+            //The section before the next '*' has a fixed length, so once the search string is exhausted
+            //part way through it, every later start position (with fewer characters left) must fail too.
+            if (srcidx == srclen)
+                return false;
+            if (next != '?')
+            {
+                char cur = src[srcidx];
+                if (!matches(cur, next, nocase))
+                    break; // retry at next position
+            }
+            patidx++;
+            srcidx++;
+        }
     }
-    else {
-        for (;;) {
-        if (patidx == patlen)
-          return true;
-        if (pat[patidx] != '*')
+    return false;
+}
+
+/*
+ * Match src[srcidx..srclen) against the wildcard pattern pat[patidx..patlen).
+ * '?' matches exactly one character and '*' matches any run of characters (including none); when nocase
+ * is set literal characters are compared through matches() ignoring case.
+ * The fixed prefix (before the first '*') and the fixed suffix (after the last '*') are matched in place,
+ * and whatever pattern remains between them is handed to WildSubStringMatch().
+ * Returns true only if the whole of the text is matched by the whole of the pattern.
+ */
+static bool WildMatchN ( const char *src, size_t srclen, size_t srcidx,
+                    const char *pat, size_t patlen, size_t patidx, bool nocase)
+{
+    //First check for matching prefix: consume pattern and text together until the first '*'
+    char next;
+    while (patidx < patlen)
+    {
+        next = pat[patidx];
+        if (next == '*')
             break;
+        if (srcidx >= srclen)
+            return false;
+        if (next != '?')
+        {
+            if (!matches(src[srcidx], next, nocase))
+                return false;
+        }
+        srcidx++;
         patidx++;
-      }
-        for (;;) {
-        //No need to guard patLen since guaranteed to contain an ASTERISK
-        const char tail_char = pat[patlen-1];
-        if (tail_char == '*')
+    }
+
+    //Now check for matching suffix: trim pattern and text from the end until the last '*'
+    //(skipped entirely when the prefix loop already consumed the whole pattern)
+    while (patidx < patlen)
+    {
+        next = pat[patlen-1];
+        if (next == '*')
             break;
-        if (srcidx == srclen)
+        if (srcidx >= srclen)
             return false;
-        if (tail_char != '?') {
-          if (nocase) {
-            if ((toupper(tail_char)!=toupper(src[srclen-1])))
-              return false;
-          }
-          else {
-            if ((tail_char!=src[srclen-1]))
-              return false;
-          }
+        if (next != '?')
+        {
+            if (!matches(src[srclen-1], next, nocase))
+                return false;
         }
-        patlen--;
         srclen--;
-        if (patidx == patlen)
-            return true;
-      }
-      while (srcidx < srclen) {
-        if (WildMatchN(src,srclen,srcidx,
-                     pat, patlen, patidx,nocase))
-           return true;
-        srcidx++;
-      }
-      return false;
+        patlen--;
     }
-  }
+
+    //Pattern contained no '*' (the prefix loop consumed all of it) - the text must be fully consumed as well
+    if (patidx == patlen)
+        return (srcidx == srclen);
+
+    //Both loops stopped on a '*', so the remaining pattern starts and ends with one
+    dbgassertex(pat[patidx] == '*');
+    dbgassertex(pat[patlen-1] == '*');
+
+    //Skip multiple wildcards on the prefix and suffix.
+    while (patidx < patlen && pat[patidx] == '*')
+        patidx++;
+    while (patidx < patlen && pat[patlen-1] == '*')
+        patlen--;
+
+    //Only '*' separated the prefix and suffix (e.g. abc*def) - both already matched, so any remaining text is accepted
+    if (patidx == patlen)
+        return true;
+
+    //Must match at least one character, if no characters left in the search string, then it fails to match
+    if (srcidx == srclen)
+        return false;
+
+    //Search for the remaining pattern at an arbitrary position within the remaining search string
+    return WildSubStringMatch(src+srcidx, srclen-srcidx, pat+patidx, patlen-patidx, nocase);
 }
 
-bool jlib_decl WildMatch(const char *src, int srclen, const char *pat, int patlen,bool nocase)
+/*
+ * Match the first srclen characters of src against the first patlen characters of the wildcard pattern pat.
+ * Neither buffer needs to be null terminated since the lengths are passed explicitly.
+ * Forwards to WildMatchN() starting at offset 0 of both the text and the pattern.
+ */
+bool jlib_decl WildMatch(const char *src, size_t srclen, const char *pat, size_t patlen, bool nocase)
 {
   return WildMatchN(src,srclen,0,pat,patlen,0,nocase);
 }
 
+/*
+ * Match the null terminated string src against the null terminated wildcard pattern pat.
+ * Both lengths are obtained with strlen(), so neither pointer may be null.
+ */
 bool jlib_decl WildMatch(const char *src, const char *pat, bool nocase)
 {
-    return WildMatch(src,(size32_t)strlen(src),pat,(size32_t)strlen(pat),nocase);
+    //This could match constant prefixes before calling strlen(), but unlikely to be very significant
+    return WildMatchN(src, strlen(src), 0, pat, strlen(pat), 0, nocase);
 }
 
 bool jlib_decl containsWildcard(const char * pattern)

+ 1 - 1
system/jlib/jregexp.hpp

@@ -99,7 +99,7 @@ inline bool isWildString(const char *s)
     return false;
 }
 
-bool jlib_decl WildMatch(const char *src, int srclen, const char *pat, int patlen,bool nocase);
+bool jlib_decl WildMatch(const char *src, size_t srclen, const char *pat, size_t patlen,bool nocase);
 bool jlib_decl WildMatch(const char *src, const char *pat, bool nocase=false);
 bool jlib_decl WildMatchReplace(const char *src, const char *pat, const char *repl, bool nocase, StringBuffer &out);
 bool jlib_decl SoundexMatch(const char *src, const char *pat);

+ 162 - 0
testing/unittests/jlibtests.cpp

@@ -21,12 +21,14 @@
  */
 
 #ifdef _USE_CPPUNIT
+#include <memory>
 #include "jsem.hpp"
 #include "jfile.hpp"
 #include "jdebug.hpp"
 #include "jset.hpp"
 #include "sockfile.hpp"
 #include "jqueue.hpp"
+#include "jregexp.hpp"
 
 #include "unittests.hpp"
 
@@ -1009,4 +1011,164 @@ protected:
 CPPUNIT_TEST_SUITE_REGISTRATION(JlibReaderWriterTestTiming);
 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(JlibReaderWriterTestTiming, "JlibReaderWriterTestTiming");
 
+/* =========================================================== */
+
+/*
+ * Shared helpers for the WildMatch unit tests: builds a deterministic search string of a requested length
+ * and checks a null terminated list of pattern specifications against it.
+ */
+class JlibWildMatchBase : public CppUnit::TestFixture
+{
+protected:
+    // Generate a search string of 'length' characters, check every entry of 'patterns' against it and,
+    // if reportTiming is set, print the length and the elapsed time in milliseconds.
+    void testSet(unsigned length, const char * const * patterns, bool reportTiming)
+    {
+        std::unique_ptr<char[]> search(generateSearchString(length));
+        CCycleTimer timer;
+        testPatterns(search.get(), patterns);
+        if (reportTiming)
+            printf("%u: %u ms\n", length, timer.elapsedMs());
+    }
+
+    // Allocate (with new[]) a null terminated search string of len characters.  The caller owns the result.
+    char * generateSearchString(size_t len)
+    {
+        char * target = new char[len+1];
+        fillSearchString(target, len);
+        target[len] = 0;
+        return target;
+    }
+
+    // Write exactly len characters to target: a..z once each, then each letter twice (aabb..zz), then three
+    // times, and so on until len characters have been produced.  No terminator is written.
+    void fillSearchString(char * target, size_t len)
+    {
+        //Nothing to write.  Without this guard a character would be written before the "--len == 0" check,
+        //len would wrap round and the loop would run far past the end of the buffer.
+        if (len == 0)
+            return;
+
+        //Start at one repetition: a pass with zero repetitions would not produce any characters
+        for (unsigned repeat=1; ; repeat++)
+        {
+            for (unsigned char fill = 'a'; fill <= 'z'; fill++)
+            {
+                for (unsigned i=0; i < repeat; i++)
+                {
+                    *target++ = fill;
+                    if (--len == 0)
+                        return;
+                }
+            }
+        }
+    }
+
+    // Check each pattern specification in the null terminated list against 'search'.
+    // A leading '!' means the pattern is expected NOT to match; a following '~' requests a case insensitive
+    // match.  The remainder of the entry is the pattern passed to WildMatch().
+    void testPatterns(const char * search, const char * const * patterns)
+    {
+        for (const char * const * cur = patterns; *cur; cur++)
+        {
+            const char * pattern = *cur;
+            bool expected = true;
+            bool nocase = false;
+            if (*pattern == '!')
+            {
+                expected = false;
+                pattern++;
+            }
+            if (*pattern == '~')
+            {
+                nocase = true;
+                pattern++;
+            }
+            bool evaluated = WildMatch(search, pattern, nocase);
+            //Report the full specification (including the '!' and '~' flags) so a failure identifies the table entry
+            CPPUNIT_ASSERT_EQUAL_MESSAGE(*cur, expected, evaluated);
+        }
+    }
+};
+
+/*
+ * Pattern specifications checked against the 10 character search string "abcdefghij".
+ * Encoding (decoded by JlibWildMatchBase::testPatterns): a leading '!' means the pattern must NOT match,
+ * a following '~' means the match is case insensitive, the rest is the wildcard pattern.
+ * The list is terminated by nullptr.
+ */
+const char * const patterns10 [] = {
+        "!a",
+        "abcdefghij",
+        "??????????",
+        "?*c?*e*",
+        "!??*b?*h*",
+        "a*",
+        "*j",
+        "a*j",
+        "a**j",
+        "a***************j",
+        "abcde*fghij",
+        "!abcde*?*fghij",
+        "*a*j*",
+        "*a*c*e*g*j*",
+        "a?c?e?g??j",
+        "a?c?e?g?*?j",
+        "!~A",
+        "!A*",
+        "~A*",
+        "~*J",
+        "~A*J",
+        "~A**J",
+        "~A***************J",
+        "~*A*J*",
+        "~*A*C*E*G*J*",
+        "~*A*B*C*D*E*F*G*H*I*J*",
+        "~*A*?*?*?*J*",
+        "~*A*?C*?E*?*J*",
+        "~*A*C?*E?*?*J*",
+        "!~*A*.B*C*D*E*F*G*H*I*J*",
+        nullptr
+};
+
+/*
+ * Pattern specifications checked against the 100 character search string: a..z, then aabb..zz, then
+ * aaabbb... cut off at 100 characters (so the text ends with a single 'h').
+ * A leading '!' means the pattern must NOT match; the list is terminated by nullptr.
+ */
+const char * const patterns100 [] = {
+        "a*",
+        "*h",
+        "a*h",
+        "a**h",
+        "a***************h",
+        "*a*j*",
+        "*a*c*e*g*j*",
+        "!a*jj*fff",
+        "!a*jj*zzz",
+        "a*jj*fff*",
+        "*aa*jj*fff*",
+        "!a*jj*zy*",
+        nullptr
+};
+
+/*
+ * Pattern specifications used with the longer generated search strings (1000 characters in the functional
+ * test and 10000 to 1000000 characters in the timing test).
+ * A leading '!' means the pattern must NOT match; the list is terminated by nullptr.
+ */
+const char * const patternsLarge [] = {
+        "!*a*zy*",
+        "a*",
+        "a*h*",
+        "!a*jj*ab",
+        "!a*jj*zy",
+        "a*jj*fff*",
+        "!a*jj*zy*",
+/*        "!a*c*e*g*i*k*zy*", will completely destroy the performance*/
+        nullptr
+};
+
+/*
+ * Functional WildMatch tests: checks the pattern tables against generated search strings of 10, 100 and
+ * 1000 characters without reporting any timings.
+ */
+class JlibWildMatchCore : public JlibWildMatchBase
+{
+    CPPUNIT_TEST_SUITE(JlibWildMatchCore);
+        CPPUNIT_TEST(testWildMatch);
+    CPPUNIT_TEST_SUITE_END();
+
+public:
+    // Each table is paired with the search string length it was written for
+    void testWildMatch()
+    {
+        testSet(10, patterns10, false);
+        testSet(100, patterns100, false);
+        testSet(1000, patternsLarge, false);
+    }
+};
+
+// Register the suite both in the default registry and under its own name
+CPPUNIT_TEST_SUITE_REGISTRATION(JlibWildMatchCore);
+CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(JlibWildMatchCore, "JlibWildMatchCore");
+
+
+/*
+ * Timing WildMatch tests: checks patternsLarge against generated search strings of 10000, 100000 and
+ * 1000000 characters and prints the elapsed time for each length.
+ */
+class JlibWildMatchTiming : public JlibWildMatchBase
+{
+    CPPUNIT_TEST_SUITE(JlibWildMatchTiming);
+        CPPUNIT_TEST(testWildMatch);
+    CPPUNIT_TEST_SUITE_END();
+
+public:
+    // The final argument enables the timing report printed by testSet()
+    void testWildMatch()
+    {
+        testSet(10000, patternsLarge, true);
+        testSet(100000, patternsLarge, true);
+        testSet(1000000, patternsLarge, true);
+    }
+};
+
+// Register the suite both in the default registry and under its own name
+CPPUNIT_TEST_SUITE_REGISTRATION(JlibWildMatchTiming);
+CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(JlibWildMatchTiming, "JlibWildMatchTiming");
+
 #endif // _USE_CPPUNIT