Browse Source

HPCC-21128 Compile-time syntax check for python plugin

Signed-off-by: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 6 years ago
parent
commit
d67a83779f

+ 18 - 0
ecl/hql/hqlgram2.cpp

@@ -950,6 +950,24 @@ IHqlExpression * HqlGram::processEmbedBody(const attribute & errpos, IHqlExpress
         {
             HqlExprArray syntaxCheckArgs;
             embedText->unwindList(syntaxCheckArgs, no_comma);
+            if (matchesBoolean(prebind, true))
+            {
+                // To syntax-check functions with prebind set, we need to pass in the parameter names
+                StringBuffer argnames;
+                HqlExprArray & params = defineScopes.tos().activeParameters;
+                ForEachItemIn(i, params)
+                {
+                    IHqlExpression * param = &params.item(i);
+                    IAtom *name = param->queryName();
+                    if (name)
+                        argnames.append(',').append(name->queryStr());
+                }
+                argnames.append(',').append("__activity__");   // Special parameter indicating activity context - not always passed but won't break anything if we pretend it is (I hope!)
+                if (argnames.length())
+                    syntaxCheckArgs.append(*createConstant(argnames.str()+1));
+                else
+                    syntaxCheckArgs.append(*createConstant(""));  //passing nullptr might be better but not sure how
+            }
             OwnedHqlExpr syntax = createBoundFunction(this, syntaxCheckFunc, syntaxCheckArgs, lookupCtx.functionCache, true);
             OwnedHqlExpr folded = foldHqlExpression(syntax);
             if (folded->queryValue())

+ 64 - 7
plugins/py3embed/py3embed.cpp

@@ -22,6 +22,7 @@
 #include "Python.h"
 #undef ssize_t
 #else
+#define register
 #include "Python.h"
 #endif
 
@@ -41,6 +42,7 @@
 #include "nbcd.hpp"
 #include "roxiemem.hpp"
 #include "enginecontext.hpp"
+#include <regex>
 
 #if PY_MAJOR_VERSION >=3
   #define Py_TPFLAGS_HAVE_ITER 0
@@ -403,6 +405,28 @@ public:
         return getNamedTupleType(names.str());
     }
 
+    StringBuffer & reformatCompilerError(StringBuffer &ret, const char *error, unsigned leadingLines)
+    {
+        // Errors from compiler tend to look like this:
+        // "('invalid syntax', ('<embed>', 3, 12, '     sfsf ss fs dfs f sfs\n'))"
+        const char pattern [] = "\\(\\'(.*)\\', \\(\\'.*\\', ([0-9]*), ([0-9]*), (.*)\\)\\)";
+        // Hopefully there are no embedded quotes in the error message or the filename
+        std::regex regex(pattern);
+        std::cmatch matches;
+        if (std::regex_match(error, matches, regex))
+        {
+            assertex(matches.size()==5);  // matches 0 is the whole string
+            std::string err = matches.str(1);
+            unsigned line = atoi(matches.str(2).c_str());
+            unsigned col = atoi(matches.str(3).c_str());
+            std::string extra = matches.str(4);
+            if (line > leadingLines)
+                line--;
+            return ret.appendf("(%u, %u): %s: %s", line, col, err.c_str(), extra.c_str());
+        }
+        else
+            return ret.append(error);
+    }
     PyObject *compileScript(const char *text, const char *parameters)
     {
         // Note - we do not need (and must not have) a lock protecting this. It is protected by the Python GIL,
@@ -414,6 +438,7 @@ public:
         code.set(PyDict_GetItemString(compiledScripts, text));
         if (!code)
         {
+            unsigned leadingLines = (unsigned) -1;  // Number of lines from input that have not been offset by 1 line in input to compiler
             code.setown(Py_CompileString(text, "", Py_eval_input));   // try compiling as simple expression...
             if (!code)
             {
@@ -424,11 +449,24 @@ public:
                 {
                     PyErr_Clear();
                     StringBuffer wrapped;
-                    wrapPythonText(wrapped, text, parameters);
+                    wrapPythonText(wrapped, text, parameters, leadingLines);
                     code.setown(Py_CompileStringFlags(wrapped, "<embed>", Py_file_input, &flags)); // try compiling as a function body
                 }
             }
-            checkPythonError();
+            PyObject* err = PyErr_Occurred();
+            if (err)
+            {
+                OwnedPyObject pType, pValue, pTraceBack;
+                PyErr_Fetch(pType.ref(), pValue.ref(), pTraceBack.ref());
+                OwnedPyObject valStr = PyObject_Str(pValue);
+                PyErr_Clear();
+                // We reformat the error message a little, to make it more helpful
+                assertex(PyUnicode_Check(valStr));
+                const char *errtext = PyUnicode_AsUTF8AndSize(valStr, NULL);
+                StringBuffer msg;
+                reformatCompilerError(msg, errtext, leadingLines);
+                rtlFail(0, msg.str());
+            }
             if (code)
                 PyDict_SetItemString(compiledScripts, text, code);
         }
@@ -460,7 +498,7 @@ public:
     }
     static void unregister(const char *key);
 protected:
-    static StringBuffer &wrapPythonText(StringBuffer &out, const char *in, const char *params)
+    static StringBuffer &wrapPythonText(StringBuffer &out, const char *in, const char *params, unsigned &leadingLines)
     {
         // Complicated by needing to keep future import lines outside defined function
         // Per python spec, a future statement must appear near the top of the module. The only lines that can appear before a future statement are:
@@ -474,7 +512,7 @@ protected:
         StringArray lines;
         lines.appendList(in, "\n", false);
         RegExpr expr("^ *from +__future__ +import ");
-        unsigned leadingLines = 0;
+        leadingLines = 0;
         ForEachItemIn(idx, lines)
         {
             if (expr.find(lines.item(idx)))
@@ -1635,7 +1673,6 @@ public:
     {
         addArg(name, createECLDatasetIterator(metaVal.queryTypeInfo(), LINK(val)));
     }
-
 protected:
     virtual void addArg(const char *name, PyObject *arg) = 0;
 
@@ -1685,6 +1722,7 @@ public:
         argstring.append("__activity__");
     }
 
+
     virtual void callFunction()
     {
         result.setown(PyEval_EvalCode(script.get(), globals, locals));
@@ -1694,6 +1732,10 @@ public:
         if (!result || result == Py_None)
             result.set(PyDict_GetItemString(locals, "__result__"));
     }
+    void setargs(const char *args)
+    {
+        argstring.set(args);
+    }
 protected:
     virtual void addArg(const char *name, PyObject *arg)
     {
@@ -1796,9 +1838,24 @@ extern DECL_EXPORT IEmbedContext* getEmbedContext()
     return new Python3xEmbedContext;
 }
 
-extern DECL_EXPORT bool syntaxCheck(const char *script)
+extern DECL_EXPORT void syntaxCheck(size32_t & __lenResult,char * & __result,size32_t charsBody,const char * body, const char * parms)
 {
-    return true; // MORE
+    StringBuffer result;
+    try
+    {
+        checkThreadContext();
+        Owned<Python3xEmbedScriptContext> ctx = new Python3xEmbedScriptContext(threadContext);
+        ctx->setargs(parms);
+        ctx->compileEmbeddedScript(charsBody, body);
+    }
+    catch (IException *E)
+    {
+        StringBuffer msg;
+        result.append(E->errorMessage(msg));
+        E->Release();
+    }
+    __lenResult = result.length();
+    __result = result.detach();
 }
 
 } // namespace

+ 1 - 1
plugins/py3embed/python3.ecllib

@@ -17,7 +17,7 @@
 
 EXPORT Language := SERVICE : plugin('py3embed')
   integer getEmbedContext():cpp,pure,fold,namespace='py3embed',entrypoint='getEmbedContext',prototype='IEmbedContext* getEmbedContext()';
-  boolean syntaxCheck(const varstring src):cpp,pure,namespace='py3embed',entrypoint='syntaxCheck';
+  STRING syntaxCheck(UTF8 body, const VARSTRING params):cpp,pure,namespace='py3embed',entrypoint='syntaxCheck',fold;
 END;
 EXPORT getEmbedContext := Language.getEmbedContext;
 EXPORT syntaxCheck := Language.syntaxCheck;

+ 61 - 11
plugins/pyembed/pyembed.cpp

@@ -22,6 +22,7 @@
 #include "Python.h"
 #undef ssize_t
 #else
+#define register
 #include "Python.h"
 #endif
 
@@ -40,6 +41,7 @@
 #include "nbcd.hpp"
 #include "roxiemem.hpp"
 #include "enginecontext.hpp"
+#include <regex>
 
 static const char * compatibleVersions[] = {
     "Python2.7 Embed Helper 1.0.0",
@@ -400,6 +402,28 @@ public:
         return getNamedTupleType(names.str());
     }
 
+    StringBuffer & reformatCompilerError(StringBuffer &ret, const char *error, unsigned leadingLines)
+    {
+        // Errors from compiler tend to look like this:
+        // "('invalid syntax', ('<embed>', 3, 12, '     sfsf ss fs dfs f sfs\n'))"
+        const char pattern [] = "\\(\\'(.*)\\', \\(\\'.*\\', ([0-9]*), ([0-9]*), (.*)\\)\\)";
+        // Hopefully there are no embedded quotes in the error message or the filename
+        std::regex regex(pattern);
+        std::cmatch matches;
+        if (std::regex_match(error, matches, regex))
+        {
+            assertex(matches.size()==5);  // matches 0 is the whole string
+            std::string err = matches.str(1);
+            unsigned line = atoi(matches.str(2).c_str());
+            unsigned col = atoi(matches.str(3).c_str());
+            std::string extra = matches.str(4);
+            if (line > leadingLines)
+                line--;
+            return ret.appendf("(%u, %u): %s: %s", line, col, err.c_str(), extra.c_str());
+        }
+        else
+            return ret.append(error);
+    }
     PyObject *compileScript(const char *text, const char *parameters)
     {
         // Note - we do not need (and must not have) a lock protecting this. It is protected by the Python GIL,
@@ -411,6 +435,7 @@ public:
         code.set(PyDict_GetItemString(compiledScripts, text));
         if (!code)
         {
+            unsigned leadingLines = (unsigned) -1;  // Number of lines from input that have not been offset by 1 line in input to compiler
             code.setown(Py_CompileString(text, "", Py_eval_input));   // try compiling as simple expression...
             if (!code)
             {
@@ -421,11 +446,22 @@ public:
                 {
                     PyErr_Clear();
                     StringBuffer wrapped;
-                    wrapPythonText(wrapped, text, parameters);
+                    wrapPythonText(wrapped, text, parameters, leadingLines);
                     code.setown(Py_CompileStringFlags(wrapped, "<embed>", Py_file_input, &flags)); // try compiling as a function body
                 }
             }
-            checkPythonError();
+            PyObject* err = PyErr_Occurred();
+            if (err)
+            {
+                OwnedPyObject pType, pValue, pTraceBack;
+                PyErr_Fetch(pType.ref(), pValue.ref(), pTraceBack.ref());
+                OwnedPyObject valStr = PyObject_Str(pValue);
+                PyErr_Clear();
+                // We reformat the error message a little, to make it more helpful
+                StringBuffer msg;
+                reformatCompilerError(msg, PyString_AsString(valStr), leadingLines);
+                rtlFail(0, msg.str());
+            }
             if (code)
                 PyDict_SetItemString(compiledScripts, text, code);
         }
@@ -457,7 +493,7 @@ public:
     }
     static void unregister(const char *key);
 protected:
-    static StringBuffer &wrapPythonText(StringBuffer &out, const char *in, const char *params)
+    static StringBuffer &wrapPythonText(StringBuffer &out, const char *in, const char *params, unsigned &leadingLines)
     {
         // Complicated by needing to keep future import lines outside defined function
         // Per python spec, a future statement must appear near the top of the module. The only lines that can appear before a future statement are:
@@ -471,7 +507,7 @@ protected:
         StringArray lines;
         lines.appendList(in, "\n", false);
         RegExpr expr("^ *from +__future__ +import ");
-        unsigned leadingLines = 0;
+        leadingLines = 0;
         ForEachItemIn(idx, lines)
         {
             if (expr.find(lines.item(idx)))
@@ -1612,7 +1648,6 @@ public:
     {
         addArg(name, createECLDatasetIterator(metaVal.queryTypeInfo(), LINK(val)));
     }
-
 protected:
     virtual void addArg(const char *name, PyObject *arg) = 0;
 
@@ -1672,6 +1707,10 @@ public:
         if (!result || result == Py_None)
             result.set(PyDict_GetItemString(locals, "__result__"));
     }
+    void setargs(const char *args)
+    {
+        argstring.set(args);
+    }
 protected:
     virtual void addArg(const char *name, PyObject *arg)
     {
@@ -1774,9 +1813,24 @@ extern DECL_EXPORT IEmbedContext* getEmbedContext()
     return new Python27EmbedContext;
 }
 
-extern DECL_EXPORT bool syntaxCheck(const char *script)
+extern DECL_EXPORT void syntaxCheck(size32_t & __lenResult,char * & __result,size32_t charsBody,const char * body, const char * parms)
 {
-    return true; // MORE
+    StringBuffer result;
+    try
+    {
+        checkThreadContext();
+        Owned<Python27EmbedScriptContext> ctx = new Python27EmbedScriptContext(threadContext);
+        ctx->setargs(parms);
+        ctx->compileEmbeddedScript(charsBody, body);
+    }
+    catch (IException *E)
+    {
+        StringBuffer msg;
+        result.append(E->errorMessage(msg));
+        E->Release();
+    }
+    __lenResult = result.length();
+    __result = result.detach();
 }
 
 } // namespace
@@ -1790,9 +1844,5 @@ extern DECL_EXPORT IEmbedContext* getEmbedContext()
     return new py2embed::Python27EmbedContext;
 }
 
-extern DECL_EXPORT bool syntaxCheck(const char *script)
-{
-    return true; // MORE
-}
 
 } // namespace

+ 1 - 1
plugins/pyembed/python.ecllib

@@ -17,7 +17,7 @@
 
 EXPORT Language := SERVICE : plugin('py2embed')
   integer getEmbedContext():cpp,pure,fold,namespace='py2embed',entrypoint='getEmbedContext',prototype='IEmbedContext* getEmbedContext()';
-  boolean syntaxCheck(const varstring src):cpp,pure,namespace='py2embed',entrypoint='syntaxCheck';
+  STRING syntaxCheck(UTF8 body, const VARSTRING params):cpp,pure,namespace='py2embed',entrypoint='syntaxCheck',fold;
 END;
 EXPORT getEmbedContext := Language.getEmbedContext;
 EXPORT syntaxCheck := Language.syntaxCheck;