Browse Source

HPCC-8925 XMLDECODE decodes & to nothing

Various issues with the way XMLDECODE (and xml read) handle invalid escape
sequences.

1. XMLDECODE should be leave invalid escape sequences unchanged. At present
the unicode version strips them (sometimes) and the ascii version throws an
exception
2. Potential reads off the end of the input string in both cases
3. Incorrectly accepting upper-case variants of escape sequences
4. Unicode version was missing &nbsp. Strictly speaking that is an HTML escape
sequence not an XML one, but since the ascii version accepted it the unicode
one should too.

Signed-off-by: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 12 years ago
parent
commit
930f2fa953

+ 50 - 37
rtl/eclrtl/eclrtl.cpp

@@ -5164,14 +5164,17 @@ void appendUStr(MemoryBuffer & x, const char * text)
 
 ECLRTL_API void xmlDecodeStrX(size32_t & outLen, char * & out, size32_t inLen, const char * in)
 {
+    StringBuffer input(inLen, in);
     StringBuffer temp;
-    decodeXML(in, temp, inLen);
+    decodeXML(input, temp, NULL, NULL, false);
     outLen = temp.length();
     out = temp.detach();
 }
 
-bool hasPrefix(const UChar * ustr, const char * str, unsigned len)
+bool hasPrefix(const UChar * ustr, const UChar * end, const char * str, unsigned len)
 {
+    if (end - ustr < len)
+        return false;
     while (len--)
     {
         if (*ustr++ != *str++)
@@ -5190,69 +5193,79 @@ ECLRTL_API void xmlDecodeUStrX(size32_t & outLen, UChar * & out, size32_t inLen,
         switch(*cur)
         {
         case '&':
-            if(hasPrefix(cur+1, "amp;", 4))
+            if(hasPrefix(cur+1, end, "amp;", 4))
             {
                 cur += 4;
                 appendUChar(ret, '&');
             }
-            else if(hasPrefix(cur+1, "lt;", 3))
+            else if(hasPrefix(cur+1, end, "lt;", 3))
             {
                 cur += 3;
                 appendUChar(ret, '<');
             }
-            else if(hasPrefix(cur+1, "gt;", 3))
+            else if(hasPrefix(cur+1, end, "gt;", 3))
             {
                 cur += 3;
                 appendUChar(ret, '>');
             }
-            else if(hasPrefix(cur+1, "quot;", 5))
+            else if(hasPrefix(cur+1, end, "quot;", 5))
             {
                 cur += 5;
                 appendUChar(ret, '"');
             }
-            else if(hasPrefix(cur+1, "apos;", 5))
+            else if(hasPrefix(cur+1, end, "apos;", 5))
             {
                 cur += 5;
                 appendUChar(ret, '\'');
             }
-            else
+            else if(hasPrefix(cur+1, end, "nbsp;", 5))
+            {
+                cur += 5;
+                appendUChar(ret, (UChar) 0xa0);
+            }
+            else if(hasPrefix(cur+1, end, "#", 1))
             {
-                cur++;
-                if (*cur == '#')
+                const UChar * saveCur = cur;
+                bool error = true;  // until we have seen a digit...
+                cur += 2;
+                unsigned base = 10;
+                if (*cur == 'x')
                 {
+                    base = 16;
                     cur++;
-                    unsigned base = 10;
-                    if (*cur == 'x' || *cur == 'X') // strictly not sure about X.
-                    {
-                        base = 16;
-                        cur++;
-                    }
-                    UChar value = 0;
-                    while (cur < end)
+                }
+                UChar value = 0;
+                while (cur < end)
+                {
+                    unsigned digit;
+                    UChar next = *cur;
+                    if ((next >= '0') && (next <= '9'))
+                        digit = next-'0';
+                    else if ((next >= 'A') && (next <= 'F'))
+                        digit = next-'A'+10;
+                    else if ((next >= 'a') && (next <= 'f'))
+                        digit = next-'a'+10;
+                    else if (next==';')
+                        break;
+                    if (digit >= base)
                     {
-                        unsigned digit;
-                        UChar next = *cur;
-                        if ((next >= '0') && (next <= '9'))
-                            digit = next-'0';
-                        else if ((next >= 'A') && (next <= 'F'))
-                            digit = next-'A'+10;
-                        else if ((next >= 'a') && (next <= 'f'))
-                            digit = next-'a'+10;
-                        else
-                            break;
-                        if (digit >= base)
-                            break;
-                        value = value * base + digit;
-                        cur++;
+                        error = true;
+                        break;
                     }
-                    appendUChar(ret, value);
-
-                    //if (cur == end) || (*cur != ';') throw Error;
+                    error = false;
+                    value = value * base + digit;
+                    cur++;
+                }
+                if (error)
+                {
+                    appendUChar(ret, '&');
+                    cur = saveCur;
                 }
                 else
-                    appendUChar(ret, *cur);     // error... / unexpanded entity
+                    appendUChar(ret, value);
             }
-            //assertex(cur<end);
+            else
+                appendUChar(ret, *cur);
             break;
         default:
             appendUChar(ret, *cur);

+ 8 - 8
system/jlib/jptree.cpp

@@ -4125,7 +4125,7 @@ protected:
                                 ref.append(nextChar);
                             }
                             ref.append(";");
-                            decodeXML(ref, refValue, ref.length());
+                            decodeXML(ref, refValue);
                         }
                         else
                         {
@@ -4136,7 +4136,7 @@ protected:
                             {
                                 StringBuffer _ref("&");
                                 _ref.append(ref).append(';');
-                                decodeXML(ref, refValue, ref.length()); // try inbuilts
+                                decodeXML(ref, refValue); // try inbuilts
                             }
                         }
                     }
@@ -4349,10 +4349,10 @@ protected:
         }
         error("Bad comment syntax");
     }
-    const char *_decodeXML(unsigned read, const char *startMark, StringBuffer &ret, unsigned len)
+    const char *_decodeXML(unsigned read, const char *startMark, StringBuffer &ret)
     {
         const char *errMark = NULL;
-        try { return decodeXML(startMark, ret, len, &errMark, this); }
+        try { return decodeXML(startMark, ret, &errMark, this); }
         catch (IException *e)
         {
             if (errMark)
@@ -4565,7 +4565,7 @@ restart:
             else 
                 error();
 
-            _decodeXML(0, attrval.str(), tmpStr.clear(), attrval.length());
+            _decodeXML(0, attrval.str(), tmpStr.clear());
 
             if (0 == strcmp(attrName.str(), "@xsi:type") &&
                (0 == stricmp(tmpStr.str(),"SOAP-ENC:base64")))
@@ -4603,7 +4603,7 @@ restart:
                                 mark.setLength(l+1);
                             }
                             tagText.ensureCapacity(mark.length());
-                            _decodeXML(r, mark.toCharArray(), tagText, mark.length());
+                            _decodeXML(r, mark.toCharArray(), tagText);
                         }
                         readNext();
                         if ('!' == nextChar)
@@ -4899,7 +4899,7 @@ public:
                     else 
                         error();
 
-                    _decodeXML(0, attrval.str(), tmpStr.clear(), attrval.length());
+                    _decodeXML(0, attrval.str(), tmpStr.clear());
 
                     if (0 == strcmp(attrName.str(), "@xsi:type") &&
                        (0 == stricmp(tmpStr.str(),"SOAP-ENC:base64")))
@@ -4959,7 +4959,7 @@ public:
                                 }
                             }   
                             stateInfo->tagText.ensureCapacity(mark.length());
-                            _decodeXML(r, mark.toCharArray(), stateInfo->tagText, mark.length());
+                            _decodeXML(r, mark.toCharArray(), stateInfo->tagText);
                         }
                         if (endOfRoot && mark.length())
                         {

+ 88 - 139
system/jlib/jstring.cpp

@@ -1702,177 +1702,126 @@ void decodeXML(ISimpleReadStream &in, StringBuffer &out, unsigned len)
     UNIMPLEMENTED;
 }
 
-#define XMLSTRICT
-const char *decodeXML(const char *x, StringBuffer &ret, unsigned len, const char **errMark, IEntityHelper *entityHelper)
+const char *decodeXML(const char *x, StringBuffer &ret, const char **errMark, IEntityHelper *entityHelper, bool strict)
 {
     if (!x)
         return x;
-    if ((unsigned)-1 == len)
-        len = (unsigned)strlen(x);
-    const char *end = x+len;
     try
     {
-        while (x<end && *x)
+        while (*x)
         {
             if ('&' == *x)
             {
-                switch (*(x+1))
+                switch (x[1])
                 {
-                    case 'a':
-                    case 'A':
+                case 'a':
+                    switch (x[2])
                     {
-                        switch (*(x+2))
+                        case 'm':
                         {
-                            case 'm':
-                            case 'M':
+                            if ('p' == x[3] && ';' == x[4])
                             {
-                                char c1 = *(x+3);
-                                if (('p' == c1 || 'P' == c1) && ';' == *(x+4))
-                                {
-                                    x += 5;
-                                    ret.append('&');
-                                    continue;
-                                }
-                                break;
+                                x += 5;
+                                ret.append('&');
+                                continue;
                             }
-                            case 'p':
-                            case 'P':
+                            break;
+                        }
+                        case 'p':
+                        {
+                            if ('o' == x[3] && 's' == x[4] && ';' == x[5])
                             {
-                                char c1 = *(x+3);
-                                char c2 = *(x+4);
-                                if (('o' == c1 || 'O' == c1) && ('s' == c2 || 'S' == c2) && ';' == *(x+5))
-                                {
-                                    x += 6;
-                                    ret.append('\'');
-                                    continue;
-                                }
-                                break;
+                                x += 6;
+                                ret.append('\'');
+                                continue;
                             }
+                            break;
                         }
-                        break;
                     }
-                    case 'l':
-                    case 'L':
+                    break;
+                case 'l':
+                    if ('t' == x[2] && ';' == x[3])
                     {
-                        char c1 = *(x+2);
-                        if (('t' == c1 || 'T' == c1) && ';' == *(x+3))
-                        {
-                            x += 4;
-                            ret.append('<');
-                            continue;
-                        }
-                        break;
+                        x += 4;
+                        ret.append('<');
+                        continue;
                     }
-                    case 'g':
-                    case 'G':
+                    break;
+                case 'g':
+                    if ('t' == x[2] && ';' == x[3])
                     {
-                        char c1 = *(x+2);
-                        if (('t' == c1 || 'T' == c1) && ';' == *(x+3))
-                        {
-                            x += 4;
-                            ret.append('>');
-                            continue;
-                        }
-                        break;
+                        x += 4;
+                        ret.append('>');
+                        continue;
                     }
-                    case 'q':
-                    case 'Q':
+                    break;
+                case 'q':
+                    if ('u' == x[2] && 'o' == x[3] && 't' == x[4] && ';' == x[5])
                     {
-                        char c1 = *(x+2);
-                        char c2 = *(x+3);
-                        char c3 = *(x+4);
-                        if (('u' == c1 || 'U' == c1) && ('o' == c2 || 'O' == c2) && ('t' == c3 || 'T' == c3) && ';' == *(x+5))
-                        {
-                            x += 6;
-                            ret.append('"');
-                            continue;
-                        }
-                        break;
+                        x += 6;
+                        ret.append('"');
+                        continue;
                     }
-                    case 'n':
-                    case 'N':
+                    break;
+                case 'n':
+                    if ('b' == x[2] && 's' == x[3] && 'p' == x[4] && ';' == x[5])
                     {
-                        char c1 = *(x+2);
-                        char c2 = *(x+3);
-                        char c3 = *(x+4);
-                        if (('b' == c1 || 'B' == c1) && ('s' == c2 || 'S' == c2) && ('p' == c3 || 'P' == c3) && ';' == *(x+5))
-                        {
-                            x += 6;
-                            writeUtf8(0xa0, ret);
-                            continue;
-                        }
-                        break;
+                        x += 6;
+                        writeUtf8(0xa0, ret);
+                        continue;
                     }
-                    default:
+                    break;
+                case '#':
+                {
+                    const char *numstart = x+2;
+                    int base = 10;
+                    if (*numstart == 'x')
                     {
-                        x++;
-                        if (*x == '#')
-                        {
-                            x++;
-                            bool hex;
-                            if (*x == 'x' || *x == 'X') // strictly not sure about X.
-                            {
-                                hex = true;
-                                x++;
-                            }
-                            else
-                                hex = false;
-                            char *endptr;
-                            unsigned val = 0;
-                            if (hex)
-                                val = strtoul(x,&endptr,16);
-                            else
-                                val = strtoul(x,&endptr,10);
-                            if (x==endptr || *endptr != ';') {
-#ifndef XMLSTRICT
-                                LOG(MCerror, unknownJob, "&# syntax error");
-                                ret.append(*x);
-#endif
-                            }
-                            else // always convert to utf-8. Should potentially throw error if not marked as utf-8 encoded doc and out of ascii range.
-                                writeUtf8(val, ret);
-                            x = endptr+1;
-                            continue;
-                        }
-                        else
+                        base = 16;
+                        numstart++;
+                    }
+                    char *numend;
+                    unsigned val = strtoul(numstart, &numend, base);
+                    if (numstart==numend || *numend != ';')
+                    {
+                        if (strict)
+                            throw MakeStringException(-1, "invalid escaped sequence");
+                    }
+                    else // always convert to utf-8. Should potentially throw error if not marked as utf-8 encoded doc and out of ascii range.
+                    {
+                        writeUtf8(val, ret);
+                        x = numend+1;
+                        continue;
+                    }
+                    break;
+                }
+                case ';':
+                case '\0':
+                    if (strict)
+                        throw MakeStringException(-1, "invalid escaped sequence");
+                    break;
+                default:
+                    if (entityHelper)
+                    {
+                        bool error = false;
+                        const char *start=x+1;
+                        const char *finger=start;
+                        while (*finger && *finger != ';')
+                            ++finger;
+                        if (*finger == ';')
                         {
-                            if ('\0' == *x) 
-                                --x;
-                            else
+                            StringBuffer entity(finger-start, start);
+                            if (entityHelper->find(entity, ret))
                             {
-                                bool error = false;
-                                if (entityHelper)
-                                {
-                                    const char *start=x;
-                                    loop
-                                    {
-                                        ++x;
-                                        if ('\0' == *x) throw MakeStringException(-1, "missing ';'");
-                                        if (';' == *x) break;
-                                    }
-                                    StringBuffer entity(x-start, start);
-                                    if (!entityHelper->find(entity, ret))
-                                    {
-                                        error = true;
-                                        x = start;
-                                    }
-                                }
-                                else
-                                    error = true;
-                                if (error)
-                                {
-#ifdef XMLSTRICT
-                                    throw MakeStringException(-1, "invalid escaped sequence");
-#endif
-                                    ret.append('&');
-                                }
+                                x = finger + 1;
+                                continue;
                             }
                         }
-                        break;
                     }
+                    if (strict)
+                        throw MakeStringException(-1, "invalid escaped sequence");
+                    break;
                 }
-                if (x>=end)
-                    throw MakeStringException(-1, "invalid escaped sequence");
             }
             ret.append(*x);
             ++x;

+ 1 - 1
system/jlib/jstring.hpp

@@ -364,7 +364,7 @@ extern jlib_decl StringBuffer & appendStringAsQuotedECL(StringBuffer &out, unsig
 extern jlib_decl const char *decodeJSON(const char *x, StringBuffer &ret, unsigned len=(unsigned)-1, const char **errMark=NULL);
 extern jlib_decl void extractItem(StringBuffer & res, const char * src, const char * sep, int whichItem, bool caps);
 extern jlib_decl const char *encodeXML(const char *x, StringBuffer &ret, unsigned flags=0, unsigned len=(unsigned)-1, bool utf8=false);
-extern jlib_decl const char *decodeXML(const char *x, StringBuffer &ret, unsigned len=(unsigned)-1, const char **errMark=NULL, IEntityHelper *entityHelper=NULL);
+extern jlib_decl const char *decodeXML(const char *x, StringBuffer &ret, const char **errMark=NULL, IEntityHelper *entityHelper=NULL, bool strict = true);
 extern jlib_decl const char *encodeXML(const char *x, IIOStream &out, unsigned flags=0, unsigned len=(unsigned)-1, bool utf8=false);
 extern jlib_decl void decodeXML(ISimpleReadStream &in, StringBuffer &out, unsigned len=(unsigned)-1);
 

+ 90 - 0
testing/ecl/key/xmldecode.xml

@@ -0,0 +1,90 @@
+<Dataset name='Result 1'>
+ <Row><Result_1>Cats &amp; dogs</Result_1></Row>
+</Dataset>
+<Dataset name='Result 2'>
+ <Row><Result_2>&lt;html&gt;</Result_2></Row>
+</Dataset>
+<Dataset name='Result 3'>
+ <Row><Result_3>Cats &quot;love&quot; dog&apos;s dinner</Result_3></Row>
+</Dataset>
+<Dataset name='Result 4'>
+ <Row><Result_4>A is ascii for A</Result_4></Row>
+</Dataset>
+<Dataset name='Result 5'>
+ <Row><Result_5>B is ascii for B</Result_5></Row>
+</Dataset>
+<Dataset name='Result 6'>
+ <Row><Result_6>This ---&gt;&#194;&#160;&lt;---- is a non-breaking space</Result_6></Row>
+</Dataset>
+<Dataset name='Result 7'>
+ <Row><Result_7>Cats &amp; dogs</Result_7></Row>
+</Dataset>
+<Dataset name='Result 8'>
+ <Row><Result_8>&amp;amp</Result_8></Row>
+</Dataset>
+<Dataset name='Result 9'>
+ <Row><Result_9>&amp;nosuch</Result_9></Row>
+</Dataset>
+<Dataset name='Result 10'>
+ <Row><Result_10>&amp;#;</Result_10></Row>
+</Dataset>
+<Dataset name='Result 11'>
+ <Row><Result_11>&amp;#65 is not valid</Result_11></Row>
+</Dataset>
+<Dataset name='Result 12'>
+ <Row><Result_12>&amp;#d;</Result_12></Row>
+</Dataset>
+<Dataset name='Result 13'>
+ <Row><Result_13>&amp;AMP;</Result_13></Row>
+</Dataset>
+<Dataset name='Result 14'>
+ <Row><Result_14>&amp;;</Result_14></Row>
+</Dataset>
+<Dataset name='Result 15'>
+ <Row><Result_15>&amp;</Result_15></Row>
+</Dataset>
+<Dataset name='Result 16'>
+ <Row><Result_16>Cats &amp; dogs</Result_16></Row>
+</Dataset>
+<Dataset name='Result 17'>
+ <Row><Result_17>&lt;html&gt;</Result_17></Row>
+</Dataset>
+<Dataset name='Result 18'>
+ <Row><Result_18>Cats &quot;love&quot; dog&apos;s dinner</Result_18></Row>
+</Dataset>
+<Dataset name='Result 19'>
+ <Row><Result_19>A is ascii for A</Result_19></Row>
+</Dataset>
+<Dataset name='Result 20'>
+ <Row><Result_20>B is ascii for B</Result_20></Row>
+</Dataset>
+<Dataset name='Result 21'>
+ <Row><Result_21>This ---&gt; &lt;---- is a non-breaking space</Result_21></Row>
+</Dataset>
+<Dataset name='Result 22'>
+ <Row><Result_22>Cats &amp; dogs</Result_22></Row>
+</Dataset>
+<Dataset name='Result 23'>
+ <Row><Result_23>&amp;amp</Result_23></Row>
+</Dataset>
+<Dataset name='Result 24'>
+ <Row><Result_24>&amp;nosuch</Result_24></Row>
+</Dataset>
+<Dataset name='Result 25'>
+ <Row><Result_25>&amp;#;</Result_25></Row>
+</Dataset>
+<Dataset name='Result 26'>
+ <Row><Result_26>&amp;#65 is not valid</Result_26></Row>
+</Dataset>
+<Dataset name='Result 27'>
+ <Row><Result_27>&amp;#d;</Result_27></Row>
+</Dataset>
+<Dataset name='Result 28'>
+ <Row><Result_28>&amp;AMP;</Result_28></Row>
+</Dataset>
+<Dataset name='Result 29'>
+ <Row><Result_29>&amp;;</Result_29></Row>
+</Dataset>
+<Dataset name='Result 30'>
+ <Row><Result_30>&amp;</Result_30></Row>
+</Dataset>

+ 39 - 0
testing/ecl/xmldecode.ecl

@@ -0,0 +1,39 @@
+// Some non-error cases
+XMLDECODE('Cats &amp; dogs');
+XMLDECODE('&lt;html&gt;');
+XMLDECODE('Cats &quot;love&quot; dog&apos;s dinner');
+XMLDECODE('&#65; is ascii for A');
+XMLDECODE('&#x42; is ascii for B');
+XMLDECODE('This ---&gt;&nbsp;&lt;---- is a non-breaking space');
+
+// Some error cases - leave the original unchanged
+XMLDECODE('Cats & dogs');
+XMLDECODE('&amp');  // missing ;
+XMLDECODE('&nosuch');  // missing ; on unknown entity
+XMLDECODE('&#;');  // missing digits
+XMLDECODE('&#65 is not valid');  // missing semi
+XMLDECODE('&#d;');  // invalid digits
+XMLDECODE(u'&AMP;');  // invalid entity
+XMLDECODE('&;');  // missing contents
+XMLDECODE('&');  // missing contents
+
+// Now the same in unicode
+
+// Some non-error cases
+XMLDECODE(u'Cats &amp; dogs');
+XMLDECODE(u'&lt;html&gt;');
+XMLDECODE(u'Cats &quot;love&quot; dog&apos;s dinner');
+XMLDECODE(u'&#65; is ascii for A');
+XMLDECODE(u'&#x42; is ascii for B');
+XMLDECODE(u'This ---&gt;&nbsp;&lt;---- is a non-breaking space');
+
+// Some error cases - leave the original unchanged
+XMLDECODE(u'Cats & dogs');
+XMLDECODE(u'&amp');  // missing ;
+XMLDECODE(u'&nosuch');  // missing ; on unknown entity
+XMLDECODE(u'&#;');  // missing digits
+XMLDECODE(u'&#65 is not valid');  // missing semi
+XMLDECODE(u'&#d;');  // invalid digits
+XMLDECODE(u'&AMP;');  // invalid entity
+XMLDECODE(u'&;');  // missing contents
+XMLDECODE(u'&');  // missing contents