瀏覽代碼

Merge pull request #9522 from richardkchapman/multiline

HPCC-16900 Add multi-line string constants to ECL

Reviewed-by: Gavin Halliday <ghalliday@hpccsystems.com>
Gavin Halliday 8 年之前
父節點
當前提交
3230c5881d
共有 5 個文件被更改,包括 343 次插入185 次删除
  1. 1 0
      ecl/hql/hqlgram.hpp
  2. 62 185
      ecl/hql/hqllex.l
  3. 200 0
      ecl/hql/hqlparse.cpp
  4. 44 0
      testing/regress/ecl/key/strings.xml
  5. 36 0
      testing/regress/ecl/strings.ecl

+ 1 - 0
ecl/hql/hqlgram.hpp

@@ -1224,6 +1224,7 @@ class HqlLex
         bool getDefinedParameter(StringBuffer &curParam, YYSTYPE & returnToken, const char* for_what, SharedHqlExpr & resolved);
 
         bool checkUnicodeLiteral(char const * str, unsigned length, unsigned & ep, StringBuffer & msg);
+        int processStringLiteral(YYSTYPE & returnToken, char *CUR_TOKEN_TEXT, unsigned CUR_TOKEN_LENGTH, int oldColumn, int oldPosition); // Decodes a quoted string literal into a constant stored in returnToken and returns the token id (or EOF when aborting); the text is modified in place for ''' literals
 
         bool readCheckNextToken(YYSTYPE & returnToken, int expected, unsigned errCode, const char * msg);
 

+ 62 - 185
ecl/hql/hqllex.l

@@ -256,6 +256,7 @@ xpathseq      ([^}\r\n])+
 
 %x COMMENT
 %x CPP
+%x MULTISTRING
 %x SLSL
 %x SLSLHASH
 %x PGPHEADER
@@ -1527,7 +1528,7 @@ FUNCTIONMACRO|MACRO {
                         else
                             startpos += 1;  // Skip the ) of EMBED(xxx)
 
-                        // keep the orginal format info (like blanks, newlines)
+                        // keep the original format info (like blanks, newlines)
                         while (endpos != startpos && (lexer->yyBuffer[endpos-1] == 13 || lexer->yyBuffer[endpos-1] == 10))
                             endpos--;
                         int len = endpos-startpos;
@@ -1548,6 +1549,64 @@ FUNCTIONMACRO|MACRO {
 <CPP>[^\n]+         { updatepos1; }
 <CPP>\n             { updatepos1; lexer->updateNewline(); }
 
+(d|D|q|Q|v|V|u|U|u8|U8)?"'''"  { // Opening ''' of a multi-line string constant, with an optional type prefix
+                        setupdatepos; // presumably records the token start in returnToken (the closing-quote rule reads returnToken.pos back) - confirm against the macro definition
+                        BEGIN(MULTISTRING); // switch to the exclusive MULTISTRING start condition until the closing ''' is matched
+                        lexer->inCpp = true; // reset to false by the rule that matches the closing '''
+                    }
+<MULTISTRING>[^\n]*"'''"[^\n]*  {
+                        // Line holding the closing ''' of a multi-line string constant. The pattern consumes
+                        // the whole line, so any text after the closing quotes is pushed back with yyless().
+                        lexer->inCpp = false;
+                        int endpos = lexer->yyPosition;
+                        //skip to the position of the first ''' on the line
+                        //(assumes yyPosition still indexes the start of this match within yyBuffer)
+                        while (memcmp(lexer->yyBuffer+endpos, "'''", 3) != 0)
+                            endpos++;
+                        const int lastpos = endpos + 3;
+
+                        updatepos1;
+                        BEGIN(0);
+
+                        //Return any characters found after the closing '''
+                        unsigned delta = lexer->yyPosition - lastpos;
+                        yyless(CUR_TOKEN_LENGTH - delta);
+                        lexer->yyPosition -= delta;
+                        lexer->yyColumn -= delta;
+
+                        //returnToken.pos was presumably set by the rule that matched the opening quotes,
+                        //so startpos indexes the optional type prefix (or the first quote) of the literal.
+                        int startpos = returnToken.pos.position;
+                        //The opening rule accepts U and U8 as well as u and u8, so both cases must take the
+                        //unicode path; otherwise an upper-case prefix falls through to the string decoder.
+                        const char prefix = lexer->yyBuffer[startpos];
+                        if (prefix == 'u' || prefix == 'U')
+                        {
+                            bool isUtf8 = false;
+                            startpos++;
+                            if (lexer->yyBuffer[startpos]=='8')
+                            {
+                                isUtf8 = true;
+                                startpos++;
+                            }
+                            startpos +=3;   // step over the opening '''
+                            int len = endpos-startpos;
+                            Owned<IValue> unicodeValue;
+                            // Special handling required for trailing \ char which suppresses the following linefeed, as unicode unescape does not handle it
+                            StringBuffer source(len, lexer->yyBuffer+startpos);
+                            source.replaceString("\\\n","");
+                            if (isUtf8)
+                            {
+                                size32_t chars = rtlUtf8Length(source.length(), source.str());
+                                unicodeValue.setown(createUtf8Value(chars, source.str(), "", true));
+                            }
+                            else
+                                unicodeValue.setown(createUnicodeValue(source.str(), source.length(), "", true, true));
+
+                            returnToken.setExpr(createConstant(unicodeValue.getClear()));
+                            return (UNICODE_CONST);
+                        }
+                        else
+                        {
+                            //Non-unicode literal: pass the text from the prefix through the closing ''' to the shared decoder
+                            return lexer->processStringLiteral(returnToken, lexer->yyBuffer+startpos, lastpos-startpos, returnToken.pos.column, returnToken.pos.position);
+                        }
+                    }
+<MULTISTRING>[^\n]+         { updatepos1; /* content line without closing quotes (the closing-quote rule is listed first, so it wins a tie); presumably just advances the position */ }
+<MULTISTRING>\n             { updatepos1; lexer->updateNewline(); /* line break inside the string constant */ }
+
+
 "<)"                { setupdatepos; return(TYPE_RPAREN) ; }
 "(>"                { setupdatepos; return(TYPE_LPAREN) ; }
 "<=>"               { setupdatepos; return(ORDER) ; }
@@ -1698,190 +1757,8 @@ FUNCTIONMACRO|MACRO {
 (d|D|q|Q|v|V)?\'([^'\r\n\\]|\\[^\r\n])*\' {
                         int oldColumn = lexer->yyColumn;
                         int oldPosition = lexer->yyPosition;
-                        setupdatepos; 
-                        MemoryAttr tempBuff;
-                        char *b = (char *)tempBuff.allocate(CUR_TOKEN_LENGTH); // Escape sequence can only make is shorter...
-                        char *bf = b;
-                        const char *finger = CUR_TOKEN_TEXT;
-                        type_t tc = type_string;
-                        if (*finger != '\'')
-                        {
-                            if ((*finger == 'd') || (*finger == 'D'))
-                                tc = type_data;
-                            else if((*finger == 'q') || (*finger == 'Q'))
-                                tc = type_qstring;
-                            else if((*finger == 'v') || (*finger == 'V'))
-                                tc = type_varstring;
-                            finger++;
-                        }
-                        for (finger++; finger[1]; finger++)
-                        {
-                            unsigned char next = *finger;
-                            size32_t delta = (size32_t)(finger-CUR_TOKEN_TEXT);
-                            if (next == '\\')
-                            {
-                                next = finger[1];
-                                if (finger[2]==0)  // finger[1] must be '.
-                                {
-                                    assertex(false);
-                                    returnToken.setPosition(lexer->yyLineNo, oldColumn+delta, oldPosition+delta, lexer->querySourcePath());
-                                    StringBuffer msg("Can not terminate a string with escape char '\\': ");
-                                    msg.append(CUR_TOKEN_TEXT);
-                                    lexer->reportError(returnToken, RRR_ESCAPE_ENDWITHSLASH, "%s", msg.str());
-                                    if (lexer->checkAborting())
-                                        return EOF;
-                                }
-                                else if (next == '\'' || next == '\\' || next == '?' || next == '"') 
-                                {
-                                    finger++;
-                                } 
-                                else if (next == 'a') 
-                                {
-                                    next = '\a';
-                                    finger++;
-                                } 
-                                else if (next == 'b') 
-                                {
-                                    next = '\b';
-                                    finger++;
-                                } 
-                                else if (next == 'f') 
-                                {
-                                    next = '\f';
-                                    finger++;
-                                } 
-                                else if (next == 'n') 
-                                {
-                                    next = '\n';
-                                    finger++;
-                                } 
-                                else if (next == 'r') 
-                                {
-                                    next = '\r';
-                                    finger++;
-                                } 
-                                else if (next == 't') 
-                                {
-                                    next = '\t';
-                                    finger++;
-                                } 
-                                else if (next == 'v') 
-                                {
-                                    next = '\v';
-                                    finger++;
-                                } 
-                                else if (isdigit(next) && next < '8')
-                                {
-                                    //Allow octal constants for ^Z etc.
-                                    unsigned value = 0;
-                                    unsigned count;
-                                    for (count=0; count < 3; count++)
-                                    {
-                                        next = finger[count+1];
-                                        if (!isdigit(next) || next >= '8')
-                                            break;
-                                        value = value * 8 + (next - '0');
-                                    }
-                                    if(count != 3)
-                                    {
-                                        returnToken.setPosition(lexer->yyLineNo, oldColumn+delta, oldPosition+delta, lexer->querySourcePath());
-                                        StringBuffer msg;
-                                        msg.append("3-digit numeric escape sequence contained non-octal digit: ").append(next);
-                                        lexer->reportError(returnToken, ERR_ESCAPE_UNKNOWN, "%s", msg.str());
-                                        if (lexer->checkAborting())
-                                            return EOF;
-                                    }
-                                    *bf++ = value;
-                                    if(!(isValidAsciiLikeCharacter(value) || (tc == type_data)))
-                                    {
-                                        returnToken.setPosition(lexer->yyLineNo, oldColumn+delta, oldPosition+delta, lexer->querySourcePath());
-                                        lexer->reportWarning(CategoryCast, returnToken, ERR_STRING_NON_ASCII, "Character in string literal is not defined in encoding " ASCII_LIKE_CODEPAGE);
-                                        if (lexer->checkAborting())
-                                            return EOF;
-                                    }
-                                    finger += count;
-                                    continue;
-                                }
-                                else
-                                {
-                                    StringBuffer msg;
-                                    msg.append("Unrecognized escape sequence: ");
-                                    msg.append("\\").append(finger[1]);
-                                    returnToken.setPosition(lexer->yyLineNo, oldColumn+delta, oldPosition+delta, lexer->querySourcePath());
-                                    lexer->reportError(returnToken, ERR_ESCAPE_UNKNOWN, "%s", msg.str());
-                                    if (lexer->checkAborting())
-                                        return EOF;
-                                }
-                                *bf++ = next;
-                            }
-                            else if (next == '\'')
-                            {
-                                returnToken.setPosition(lexer->yyLineNo, oldColumn+delta, oldPosition+delta, lexer->querySourcePath());
-                                lexer->reportError(returnToken, ERR_STRING_NEEDESCAPE,"' needs to be escaped by \\ inside string");
-                                if (lexer->checkAborting())
-                                    return EOF;
-                            }
-                            else if (next >= 128)
-                            {
-                                const byte * temp = (byte *)finger;
-                                unsigned lenLeft = CUR_TOKEN_LENGTH - (size32_t)(finger - CUR_TOKEN_TEXT);
-                                int extraCharsRead = rtlSingleUtf8ToCodepage(bf, lenLeft, finger, ASCII_LIKE_CODEPAGE);
-                                if (extraCharsRead == -1)
-                                {
-                                    //This really has to be an error, otherwise it will work most of the time, but will then sometimes fail
-                                    //because two characters > 128 are next to each other.
-                                    returnToken.setPosition(lexer->yyLineNo, oldColumn+delta, oldPosition+delta, lexer->querySourcePath());
-                                    lexer->reportError(returnToken, ERR_STRING_NON_ASCII, "Character in string literal is not legal UTF-8");
-                                    if (lexer->checkAborting())
-                                        return EOF;
-                                    *bf = next;
-                                }
-                                else
-                                {
-                                    if (*bf == ASCII_LIKE_SUBS_CHAR)
-                                    {
-                                        returnToken.setPosition(lexer->yyLineNo, oldColumn+delta, oldPosition+delta, lexer->querySourcePath());
-                                        lexer->reportWarning(CategoryCast, returnToken, ERR_STRING_NON_ASCII, "Character in string literal is not defined in encoding " ASCII_LIKE_CODEPAGE ", try using a unicode constant");
-                                    }
-                                    finger += extraCharsRead;
-                                }
-                                bf++;
-                            }
-                            else
-                            {
-                                *bf++ = next;
-                                if(!(isValidAsciiLikeCharacter(next) || (tc == type_data)))
-                                {
-                                    returnToken.setPosition(lexer->yyLineNo, oldColumn+delta, oldPosition+delta, lexer->querySourcePath());
-                                    lexer->reportError(returnToken, ERR_STRING_NON_ASCII, "Character in string literal is not defined in encoding " ASCII_LIKE_CODEPAGE);
-                                    if (lexer->checkAborting())
-                                        return EOF;
-                                }
-                            }
-                        }
-                        returnToken.setPosition(lexer->yyLineNo, oldColumn, oldPosition, lexer->querySourcePath());
-                        switch (tc)
-                        {
-                        case type_qstring:
-                            {
-                                Owned<ITypeInfo> qStrType = makeQStringType(UNKNOWN_LENGTH); 
-                                returnToken.setExpr(createConstant(qStrType->castFrom((size32_t)(bf-b), b)));
-                                return (DATA_CONST);
-                            }
-                        case type_data:
-                            {
-                                returnToken.setExpr(createConstant(createDataValue(b, (size32_t)(bf-b))));
-                                return (DATA_CONST);
-                            }
-                        case type_varstring:
-                            {
-                                returnToken.setExpr(createConstant(createVarStringValue((size32_t)(bf-b), b, makeVarStringType(UNKNOWN_LENGTH))));
-                                return (DATA_CONST);
-                            }
-                        case type_string:
-                            returnToken.setExpr(createConstant(createStringValue(b, (size32_t)(bf-b))));
-                            return (STRING_CONST);
-                        }
+                        setupdatepos;
+                        return lexer->processStringLiteral(returnToken, CUR_TOKEN_TEXT, CUR_TOKEN_LENGTH, oldColumn, oldPosition);
                     }
 
 (d|D|q|Q|u|U|v|V)?\'([^'\r\n\\]|\\[^\r\n])*(\\)? {  

+ 200 - 0
ecl/hql/hqlparse.cpp

@@ -28,6 +28,7 @@
 
 #define YY_NO_UNISTD_H
 #include "hqllex.hpp"
+#include "eclrtl.hpp"
 
 //#define TIMING_DEBUG
 
@@ -2129,6 +2130,205 @@ bool HqlLex::checkUnicodeLiteral(char const * str, unsigned length, unsigned & e
     return true;
 }
 
+int HqlLex::processStringLiteral(YYSTYPE & returnToken, char *CUR_TOKEN_TEXT, unsigned CUR_TOKEN_LENGTH, int oldColumn, int oldPosition) /*
+    Decode the quoted string literal held in CUR_TOKEN_TEXT into a constant expression stored in returnToken.
+
+    CUR_TOKEN_TEXT    Text of the literal: an optional one-character prefix (d/D data, q/Q qstring, v/V varstring,
+                      any other prefix character is skipped and the literal is treated as a string), the opening
+                      quote(s), the content and the closing quote(s). The scan stops at a NUL character, so a
+                      single-quoted literal must be NUL terminated directly after its closing quote. For a
+                      triple-quoted literal this function writes a NUL at offset CUR_TOKEN_LENGTH-2, i.e. it
+                      MODIFIES the buffer supplied by the caller.
+    CUR_TOKEN_LENGTH  Length of the literal text, including any prefix and the quotes.
+    oldColumn, oldPosition
+                      Column and position of the start of the literal; used as the base when positioning error
+                      and warning messages, and for the position of the returned token.
+
+    Recognised escapes: backslash followed by a quote, backslash, question mark, double quote, a b f n r t v,
+    three octal digits, or a newline (the backslash and the newline are both dropped). Bytes >= 128 are converted
+    with rtlSingleUtf8ToCodepage. Problems are reported through reportError/reportWarning and decoding continues.
+
+    Returns STRING_CONST for a string literal, DATA_CONST for data, qstring and varstring literals, or EOF if
+    checkAborting() returns true after a problem has been reported.
+*/
+{
+    MemoryAttr tempBuff;
+    char *b = (char *)tempBuff.allocate(CUR_TOKEN_LENGTH); // Escape sequences can only make it shorter...
+    char *bf = b;
+    const char *finger = CUR_TOKEN_TEXT;
+    type_t tc = type_string;
+    if (*finger != '\'') // a one-character type prefix precedes the opening quote
+    {
+        if ((*finger == 'd') || (*finger == 'D'))
+            tc = type_data;
+        else if((*finger == 'q') || (*finger == 'Q'))
+            tc = type_qstring;
+        else if((*finger == 'v') || (*finger == 'V'))
+            tc = type_varstring;
+        finger++;
+    }
+    bool isMultiline = false;
+    if (finger[1]=='\'' && finger[2]=='\'') // the opening quote is followed by two more quotes: a ''' literal
+    {
+        isMultiline = true;
+        CUR_TOKEN_TEXT[CUR_TOKEN_LENGTH-2] = '\0'; // NOTE: writes into the caller's buffer so that the loop below stops at the first quote of the closing '''
+        finger += 2; // skip the two extra opening quotes
+    }
+    for (finger++; finger[1]; finger++) // start after the opening quote; stop on the character that precedes the terminating NUL (the closing quote)
+    {
+        unsigned char next = *finger;
+        size32_t delta = (size32_t)(finger-CUR_TOKEN_TEXT); // offset of the current character, used to position diagnostics
+        if (next == '\\')
+        {
+            next = finger[1];
+            if (finger[2]==0)  // finger[1] must be '.
+            {
+                returnToken.setPosition(yyLineNo, oldColumn+delta, oldPosition+delta, querySourcePath());
+                StringBuffer msg("Can not terminate a string with escape char '\\': ");
+                msg.append(CUR_TOKEN_TEXT);
+                reportError(returnToken, RRR_ESCAPE_ENDWITHSLASH, "%s", msg.str());
+                if (checkAborting())
+                    return EOF;
+            }
+            else if (next == '\'' || next == '\\' || next == '?' || next == '"')
+            {
+                finger++;
+            }
+            else if (next == '\n')
+            {
+                finger++;
+                continue;  // A \ at end of line in a multiline constant means remove the end-of-line
+            }
+            else if (next == 'a')
+            {
+                next = '\a';
+                finger++;
+            }
+            else if (next == 'b')
+            {
+                next = '\b';
+                finger++;
+            }
+            else if (next == 'f')
+            {
+                next = '\f';
+                finger++;
+            }
+            else if (next == 'n')
+            {
+                next = '\n';
+                finger++;
+            }
+            else if (next == 'r')
+            {
+                next = '\r';
+                finger++;
+            }
+            else if (next == 't')
+            {
+                next = '\t';
+                finger++;
+            }
+            else if (next == 'v')
+            {
+                next = '\v';
+                finger++;
+            }
+            else if (isdigit(next) && next < '8')
+            {
+                //Allow octal constants for ^Z etc.
+                unsigned value = 0;
+                unsigned count;
+                for (count=0; count < 3; count++)
+                {
+                    next = finger[count+1];
+                    if (!isdigit(next) || next >= '8')
+                        break;
+                    value = value * 8 + (next - '0');
+                }
+                if(count != 3)
+                {
+                    returnToken.setPosition(yyLineNo, oldColumn+delta, oldPosition+delta, querySourcePath());
+                    StringBuffer msg;
+                    msg.append("3-digit numeric escape sequence contained non-octal digit: ").append(next);
+                    reportError(returnToken, ERR_ESCAPE_UNKNOWN, "%s", msg.str());
+                    if (checkAborting())
+                        return EOF;
+                }
+                *bf++ = value; // the value accumulated so far is still emitted after a short escape has been reported
+                if(!(isValidAsciiLikeCharacter(value) || (tc == type_data)))
+                {
+                    returnToken.setPosition(yyLineNo, oldColumn+delta, oldPosition+delta, querySourcePath());
+                    reportWarning(CategoryCast, returnToken, ERR_STRING_NON_ASCII, "Character in string literal is not defined in encoding " ASCII_LIKE_CODEPAGE);
+                    if (checkAborting())
+                        return EOF;
+                }
+                finger += count; // the loop increment then steps past the last digit consumed
+                continue;
+            }
+            else
+            {
+                StringBuffer msg;
+                msg.append("Unrecognized escape sequence: ");
+                msg.append("\\").append(finger[1]);
+                returnToken.setPosition(yyLineNo, oldColumn+delta, oldPosition+delta, querySourcePath());
+                reportError(returnToken, ERR_ESCAPE_UNKNOWN, "%s", msg.str());
+                if (checkAborting())
+                    return EOF;
+            }
+            *bf++ = next;
+        }
+        else if (next == '\'' && !isMultiline) // unescaped quotes are only legal inside ''' literals
+        {
+            returnToken.setPosition(yyLineNo, oldColumn+delta, oldPosition+delta, querySourcePath());
+            reportError(returnToken, ERR_STRING_NEEDESCAPE,"' needs to be escaped by \\ inside string");
+            if (checkAborting())
+                return EOF;
+        }
+        else if (next >= 128)
+        {
+            const byte * temp = (byte *)finger; // NOTE(review): temp is not used below
+            unsigned lenLeft = CUR_TOKEN_LENGTH - (size32_t)(finger - CUR_TOKEN_TEXT);
+            int extraCharsRead = rtlSingleUtf8ToCodepage(bf, lenLeft, finger, ASCII_LIKE_CODEPAGE);
+            if (extraCharsRead == -1)
+            {
+                //This really has to be an error, otherwise it will work most of the time, but will then sometimes fail
+                //because two characters > 128 are next to each other.
+                returnToken.setPosition(yyLineNo, oldColumn+delta, oldPosition+delta, querySourcePath());
+                reportError(returnToken, ERR_STRING_NON_ASCII, "Character in string literal is not legal UTF-8");
+                if (checkAborting())
+                    return EOF;
+                *bf = next; // keep the raw byte so that decoding can continue
+            }
+            else
+            {
+                if (*bf == ASCII_LIKE_SUBS_CHAR)
+                {
+                    returnToken.setPosition(yyLineNo, oldColumn+delta, oldPosition+delta, querySourcePath());
+                    reportWarning(CategoryCast, returnToken, ERR_STRING_NON_ASCII, "Character in string literal is not defined in encoding " ASCII_LIKE_CODEPAGE ", try using a unicode constant");
+                }
+                finger += extraCharsRead;
+            }
+            bf++;
+        }
+        else
+        {
+            *bf++ = next;
+            if(!(isValidAsciiLikeCharacter(next) || (tc == type_data)))
+            {
+                returnToken.setPosition(yyLineNo, oldColumn+delta, oldPosition+delta, querySourcePath());
+                reportError(returnToken, ERR_STRING_NON_ASCII, "Character in string literal is not defined in encoding " ASCII_LIKE_CODEPAGE);
+                if (checkAborting())
+                    return EOF;
+            }
+        }
+    }
+    returnToken.setPosition(yyLineNo, oldColumn, oldPosition, querySourcePath()); // restore the token position to the start of the literal
+    switch (tc)
+    {
+    case type_qstring:
+        {
+            Owned<ITypeInfo> qStrType = makeQStringType(UNKNOWN_LENGTH);
+            returnToken.setExpr(createConstant(qStrType->castFrom((size32_t)(bf-b), b)));
+            return (DATA_CONST);
+        }
+    case type_data:
+        {
+            returnToken.setExpr(createConstant(createDataValue(b, (size32_t)(bf-b))));
+            return (DATA_CONST);
+        }
+    case type_varstring:
+        {
+            returnToken.setExpr(createConstant(createVarStringValue((size32_t)(bf-b), b, makeVarStringType(UNKNOWN_LENGTH))));
+            return (DATA_CONST);
+        }
+    case type_string:
+        returnToken.setExpr(createConstant(createStringValue(b, (size32_t)(bf-b))));
+        return (STRING_CONST);
+    }
+    throwUnexpected(); // only reached if tc holds a value the switch does not handle
+}
+
 //====================================== Error Reporting  ======================================
 
 bool HqlLex::checkAborting()

+ 44 - 0
testing/regress/ecl/key/strings.xml

@@ -0,0 +1,44 @@
+<Dataset name='Result 1'>
+ <Row><Result_1>Single
+quotes</Result_1></Row>
+</Dataset>
+<Dataset name='Result 2'>
+ <Row><Result_2>Can&apos;t be multiline and must escape embedded quotes</Result_2></Row>
+</Dataset>
+<Dataset name='Result 3'>
+ <Row><Result_3>€</Result_3></Row>
+</Dataset>
+<Dataset name='Result 4'>
+ <Row><Result_4>Can use various prefixes</Result_4></Row>
+</Dataset>
+<Dataset name='Result 5'>
+ <Row><Result_5>37313732373337343735</Result_5></Row>
+</Dataset>
+<Dataset name='Result 6'>
+ <Row><Result_6>ABCDE</Result_6></Row>
+</Dataset>
+<Dataset name='Result 7'>
+ <Row><Result_7>Triple
+quotes can have embedded newlines, but also support
+escape sequence</Result_7></Row>
+</Dataset>
+<Dataset name='Result 8'>
+ <Row><Result_8>Unicode triple
+quotes should be the same, and also support
+escape sequence</Result_8></Row>
+</Dataset>
+<Dataset name='Result 9'>
+ <Row><Result_9>Don&apos;t have to be multiline and need not escape embedded quotes (but &apos;can&apos; if they want)</Result_9></Row>
+</Dataset>
+<Dataset name='Result 10'>
+ <Row><Result_10>€</Result_10></Row>
+</Dataset>
+<Dataset name='Result 11'>
+ <Row><Result_11>Can use same prefixes as single</Result_11></Row>
+</Dataset>
+<Dataset name='Result 12'>
+ <Row><Result_12>37313732373337343735</Result_12></Row>
+</Dataset>
+<Dataset name='Result 13'>
+ <Row><Result_13>ABCDE</Result_13></Row>
+</Dataset>

+ 36 - 0
testing/regress/ecl/strings.ecl

@@ -0,0 +1,36 @@
+/*##############################################################################
+
+    HPCC SYSTEMS software Copyright (C) 2017 HPCC Systems®.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+############################################################################## */
+
+'Single\nquotes';
+u'Can\'t be multiline and must escape embedded quotes';
+u8'€';
+v'Can use various prefixes';
+d'7172737475';
+Q'ABCDE';
+
+
+'''Triple
+quotes can have embedded newlines, but also \
+support\nescape sequence''';
+u'''Unicode triple
+quotes should be the same, and also \
+support\nescape sequence''';
+u'''Don't have to be multiline and need not escape embedded quotes (but \'can' if they want)''';
+u8'''€''';
+v'''Can use same prefixes as single''';
+d'''7172737475''';
+Q'''ABCDE''';