WorkUnits.rst 45 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954
  1. =======================
  2. Understanding workunits
  3. =======================
  4. Introduction
  5. ************
  6. A workunit contains all the information that the system requires about a query - including the parameters it takes,
  7. how to execute it, and how to format the results. Understanding the contents of a workunit is a key step to
  8. understanding how the HPCC system fits together. This document begins with an overview of the different elements
  9. in a workunit. That is then followed by a walk-through executing a simple query, with a more detailed description of
  10. some of the workunit components to show how they all tie together.
  11. Before looking at the contents of a workunit it is important to understand one of the design goals behind the HPCC
  12. system. The HPCC system logically splits the code that is needed to execute a query in two. On the one hand there
  13. are the algorithms that are used to perform different dataset operations (e.g. sorting, deduping). The same algorithms are
  14. used by all the queries that execute on the system. On the other hand there is the meta-data that describes the columns
  15. present in the datasets, which columns you need to sort by, and the order of operations required by the query. These
  16. are typically different for each query. This "meta-data" includes generated compare functions, classes that describe
  17. the record formats, serialized data and graphs.
  18. A workunit only contains data and code relating to the meta data for the query, i.e. "what to do", while the
  19. different engines (hthor, roxie, and thor) implement the algorithms - "how to do it". If you look at a workunit for
  20. an ECL query that sorts a dataset you will not find code to perform the sort itself in the workunit - you will not
  21. even find a call to a sort library function - that logic is contained in the engine that executes the query.
  22. One consequence of this split, which can be initially confusing, is that execution continually passes back and forth
  23. between the generated code and the engines. By the end of this document you should have a better understanding
  24. of how the generated code is structured and the part it plays in executing a query.
  25. Note the term "Query" is used as a generic term to cover read-only queries (typically run in roxie) and ETL (Extract,
  26. Transform, and Load) style
  27. queries that create lots of persistent datafiles (typically run in Thor). Also, the term "workunit" is used
  28. ambiguously. The dll created from a query is called a workunit (which is static), but "workunit" is also
  29. used to describe a single execution of a query (which includes the parameters and results). It should be clear from
  30. the context which of these is meant.
  31. Throughout this document "dll" is a generic term used to refer to a dynamically loaded library. These correspond
  32. to shared objects in Linux (extension '.so'), dynamic libraries in Mac OS X ('.dylib'), and dynamic
  33. link libraries in Windows ('.dll').
  34. Contents of a workunit
  35. ======================
  36. A workunit is generated by the ecl compiler, and consists of a single dll. That dll contains
  37. several different elements:
  38. * Various C++ helper classes, and exported factory methods that are used to create instances of those classes.
  39. * An XML resource containing different pieces of information about a workunit, including workflow and graphs.
  40. * Other user-defined resources included in the manifest.
  41. A workunit dll contains everything that the engines need to execute the query. When a workunit is executed, key
  42. elements of the xml information are cloned from the workunit dll and copied into a database. This is then
  43. augmented with other information as the query is executed - e.g. input parameters, results, statistics, etc. The
  44. contents of the workunit are accessed through an "IWorkUnit" interface
  45. (defined in common/workunit/workunit.hpp) that hides the implementation details.
  46. (Workunit information is currently stored in the Dali database - one of the components within the HPCC platform.
  47. Work is in-progress to allow the bulk of this workunit data to be stored in Cassandra or another third-party database
  48. instead.)
  49. How is the workunit used?
  50. =========================
  51. The workunit information is used by most of the components in the system. The following is a quick outline:
  52. * eclcc
  53. Creates a workunit dll from an ecl query.
  54. * eclccserver
  55. Executes eclcc to create a workunit dll, and then clones some of the information into dali to create an active instance, ready to execute.
  56. * esp
  57. Uses information in the workunit dll to publish workunits. This includes details of the parameters that the query takes, how
  58. they should be formatted, and the results it returns.
  59. * eclscheduler
  60. Monitors workunits that are waiting for events, and updates them when those events occur.
  61. * eclagent/Roxie
  62. Process the different workflow actions, and workflow code.
  63. * hThor/Roxie/Thor
  64. Execute graphs within the workflow items.
  65. * Dali
  66. This database is used to store the state of the workunit.
  67. Example
  68. *******
  69. The following ECL will be used as an example for the rest of the discussion. It is a very simple search that
  70. takes a string parameter 'searchName', which is the name of the person to search for, and returns the matching records
  71. from an index. It also outputs the word 'Done!' as a separate result.
  72. ::
  73. STRING searchName := 'Smith' : STORED('searchName');
  74. nameIndex := INDEX({ STRING40 name, STRING80 address }, 'names');
  75. results := nameIndex(KEYED(name = searchName));
  76. OUTPUT(results);
  77. OUTPUT('Done!');
  78. Extracts from the XML and C++ that are generated from this example will be included in the following discussion.
  79. Workunit Main Elements
  80. **********************
  81. This section outlines the different sections in a workunit. This is followed by a walk-through of the stages that
  82. take place when a workunit is executed, together with a more detailed explanation of the workunit contents.
  83. Workflow
  84. ========
  85. The workflow is the highest level of control within a workunit. It is used for two related purposes:
  86. - Scheduling
  87. The HPCC system allows ECL code to be executed when certain events occur - e.g. every hour
  88. or when files are uploaded to a landing zone. Each piece of ECL code that is triggered by an external event
  89. creates a separate workflow action. This allows each of those events to be processed independently.
  90. - Splitting up queries.
  91. There are situations where it is useful to break parts of an ECL query into independent sections. The simplest example
  92. is the PERSIST workflow operation, which allows results to be shared between different work units. Each workflow operation
  93. creates one (or sometimes more) independent workflow items, which are then connected together.
  94. Each piece of independent ECL is given a unique workflow id (wfid). Often workflow items need to be executed in a particular order,
  95. e.g. ensuring a persist exists before using it, which is managed with dependencies between different workflow items.
  96. Our example above generates the following XML entry in the workunit::
  97. <Workflow>
  98. <Item .... wfid="1"/>
  99. <Item .... wfid="2">
  100. <Dependency wfid="1"/>
  101. <Schedule/>
  102. </Item>
  103. </Workflow>
  104. This contains two workflow items. The first workflow item (wfid=1) ensures that the stored value has a default value if it has not been supplied.
  105. The second item (with wfid=2) is the main code for the query. This has a dependency on the first workflow item
  106. because the stored variable needs to be initialised before it is executed.
  107. MyProcess
  108. =========
  109. The generated code contains a class instance that is used for executing the code associated with the workflow items. It is generated at the end of the main C++ module. E.g.::
  110. struct MyEclProcess : public EclProcess {
  111. virtual int perform(IGlobalCodeContext * gctx, unsigned wfid) {
  112. ....
  113. switch (wfid) {
  114. case 1U:
  115. ... code for workflow item 1 ...
  116. case 2U:
  117. ... code for workflow item 2 ...
  118. break;
  119. }
  120. return 2U;
  121. }
  122. };
  123. The main element is a switch statement inside the perform() function that allows the workflow engines to execute the code associated with a particular workflow item.
  124. There is also an associated factory function that is exported from the dll, and is used by the engines to create instances of the class::
  125. extern "C" ECL_API IEclProcess* createProcess()
  126. {
  127. return new MyEclProcess;
  128. }
  129. Graph
  130. =====
  131. Most of the work executing a query involves processing dataset operations, which are implemented as
  132. a graph of activities. Each graph is represented in the workunit as an xml graph structure (currently
  133. it uses the xgmml format). The graph xml includes details of which types of activities are required to be
  134. executed, how they are linked together, and any other dependencies.
  135. The graph in our example is particularly simple::
  136. <Graph name="graph1" type="activities">
  137. <xgmml>
  138. <graph wfid="2">
  139. <node id="1">
  140. <att>
  141. <graph>
  142. <att name="rootGraph" value="1"/>
  143. <edge id="2_0" source="2" target="3"/>
  144. <node id="2" label="Index Read&#10;&apos;names&apos;">
  145. ... attributes for activity 2 ...
  146. </node>
  147. <node id="3" label="Output&#10;Result #1">
  148. ... attributes for activity 3 ...
  149. </node>
  150. </graph>
  151. </att>
  152. </node>
  153. </graph>
  154. </xgmml>
  155. </Graph>
  156. This graph contains a single subgraph (node id=1) that contains two activities - an index read activity and an output result
  157. activity. These activities are linked by a single edge (id "2_0"). The details of the contents are covered in the section on
  158. executing graphs below.
  159. Generated Activity Helpers
  160. ==========================
  161. Each activity has a corresponding class instance in the generated code, and a factory function for creating instances of that class::
  162. struct cAc2 : public CThorIndexReadArg {
  163. ... Implementation of the helper for activity #2 ...
  164. };
  165. extern "C" ECL_API IHThorArg * fAc2() { return new cAc2; }
  166. struct cAc3 : public CThorWorkUnitWriteArg {
  167. ... Implementation of the helper for activity #3 ...
  168. };
  169. extern "C" ECL_API IHThorArg * fAc3() { return new cAc3; }
  170. The helper class for an activity implements the interface that is required for that particular kind. (The interfaces are defined in
  171. rtl/include/eclhelper.hpp - further details below.)
  172. Other
  173. =====
  174. There are several other items, detailed below, that are logically associated with a workunit. The information may be
  175. stored in the workunit dll or in various other locations e.g. Dali, Sasha or Cassandra. It is all accessed through the IWorkUnit
  176. interface in common/workunit/workunit.hpp that hides the implementation details. For instance information generated at runtime cannot
  177. by definition be included in the workunit dll.
  178. Options
  179. -------
  180. Options that are supplied to eclcc via the -f command line option, or the #option statement are included in the <Debug> section of the workunit xml::
  181. <Debug>
  182. <addtimingtoworkunit>0</addtimingtoworkunit>
  183. <noterecordsizeingraph>1</noterecordsizeingraph>
  184. <showmetaingraph>1</showmetaingraph>
  185. <showrecordcountingraph>1</showrecordcountingraph>
  186. <spanmultiplecpp>0</spanmultiplecpp>
  187. <targetclustertype>hthor</targetclustertype>
  188. </Debug>
  189. Note, the names of workunit options are case insensitive, and converted to lower case.
  190. Input Parameters
  191. ----------------
  192. Many queries contain input parameters that modify their behaviour. These
  193. correspond to STORED definitions in the ECL. Our example contains a single string "searchName", so the
  194. workunit contains a single input parameter::
  195. <Variables>
  196. <Variable name="searchname">
  197. <SchemaRaw xsi:type="SOAP-ENC:base64">
  198. searchname&#xe000;&#xe004;&#241;&#255;&#255;&#255;&#xe001;ascii&#xe000;&#xe001;ascii&#xe000;&#xe000;&#xe018;&#xe000;&#xe000;&#xe000;&#xe000; </SchemaRaw>
  199. </Variable>
  200. </Variables>
  201. The implementation details of the schema information are encapsulated by the IConstWUResult interface in workunit.hpp.
  202. Results
  203. -------
  204. The workunit xml also contains details of each result that the query generates, including a serialized description of the output record format::
  205. <Results>
  206. <Result isScalar="0"
  207. name="Result 1"
  208. recordSizeEntry="mf1"
  209. rowLimit="-1"
  210. sequence="0">
  211. <SchemaRaw xsi:type="SOAP-ENC:base64">
  212. name&#xe000;&#xe004;(&#xe000;&#xe000;&#xe000;&#xe001;ascii&#xe000;&#xe001;ascii&#xe000;address&#xe000;&#xe004;P&#xe000;&#xe000;&#xe000;&#xe001;ascii&#xe000;&#xe001;ascii&#xe000;&#xe000;&#xe018;%&#xe000;&#xe000;&#xe000;{ string40 name, string80 address };&#10; </SchemaRaw>
  213. </Result>
  214. <Result name="Result 2" sequence="1">
  215. <SchemaRaw xsi:type="SOAP-ENC:base64">
  216. Result_2&#xe000;&#xe004;&#241;&#255;&#255;&#255;&#xe001;ascii&#xe000;&#xe001;ascii&#xe000;&#xe000;&#xe018;&#xe000;&#xe000;&#xe000;&#xe000; </SchemaRaw>
  217. </Result>
  218. </Results>
  219. In our example there are two - the dataset of results and the text string "Done!". The values of the results for
  220. a query are associated with the workunit. (They are currently saved in dali, but this may change in version 6.0.)
  221. Timings and Statistics
  222. ----------------------
  223. Any timings generated when compiling the query are included in the workunit dll::
  224. <Statistics>
  225. <Statistic c="eclcc"
  226. count="1"
  227. creator="eclcc"
  228. kind="SizePeakMemory"
  229. s="compile"
  230. scope="compile"
  231. ts="1428933081084000"
  232. unit="sz"
  233. value="27885568"/>
  234. </Statistics>
  235. Other statistics and timings created when running the query are stored in the runtime copy of the workunit.
  236. (Statistics for graph elements are stored in a different format from global statistics, but the IWorkUnit interface
  237. ensures the implementation details are hidden.)
  238. Manifests
  239. ---------
  240. It is possible to include other user-defined resources in the workunit dll - e.g. web pages, or dashboard layouts.
  241. I have to confess I do not understand them... ??Tony please provide some more information....!
  242. Stages of Execution
  243. *******************
  244. Once a workunit has been compiled to a dll it is ready to be executed. Execution can be triggered in different ways, E.g.:
  245. * The ECL for a query is submitted to esp
  246. - A workunit entry, containing the ECL, is created in dali and added to an eclccserver queue.
  247. - An eclccserver instance removes the workunit from the queue, and compiles the ECL to a workunit dll.
  248. - The dali workunit entry is updated with the information from the workunit dll.
  249. - The dali workunit is added to the agent execution queue associated with the target cluster.
  250. - The associated engine (actually agentexec for hThor and Thor) pulls a query from the queue and executes it.
  251. * A query is submitted and published with a name. Another request is then submitted to execute this previously compiled query.
  252. - A workunit entry, containing the ECL, is created in dali and added to an eclccserver queue.
  253. - An eclccserver instance removes the workunit from the queue, and compiles the ECL to a workunit dll.
  254. - There is a 'query set' for each combination of query name and the target cluster. The new workunit
  255. dll is added to the appropriate query set, and marked as the current active implementation.
  256. - Later, a query that references a named query is submitted to esp.
  257. - The name and target cluster are mapped via the query set to the active implementation, and a workunit instance is created from the active workunit dll.
  258. - The workunit is added to a roxie or eclagentexec queue ready to be executed.
  259. - The associated engine pulls a query from the queue and executes it.
  260. * A query is compiled as a stand alone executable. The executable is then run.
  261. - eclcc is executed on the command line without the -shared command line option.
  262. - The resulting executable is run. The engine used to execute the query depends on the -platform
  263. parameter supplied to eclcc.
  264. Most queries create persistent workunits in dali and then update those workunits with results as they are calculated,
  265. however for some roxie queries (e.g. in a production system) the execution workunits are transient.
  266. The following walk-through details the main stages executing a query, and the effect each of the query elements has.
  267. Queues
  268. ======
  269. The system uses several inter-process queues to communicate between the different components in the system. These queues
  270. are implemented by dali. Components can subscribe to one or more queues, and receive notifications when entries are
  271. available.
  272. Some example queues are:
  273. * <cluster>.eclserver - workunits to be compiled
  274. * <cluster>.roxie - workunits to execute on roxie
  275. * <cluster>.thor - graphs to execute on thor
  276. * <cluster>.eclscheduler - workunits that need to wait for events
  277. * <cluster>.agent - workunits to be executed on hthor or thor.
  278. * dfuserver_queue - dfu workunits for sprays/file copies etc.
  279. Workflow
  280. ========
  281. When a workunit is ready to be run, the workflow controls the flow of execution. The workflow engine
  282. (defined in common/workunit/workflow.cpp) is responsible for determining which workflow item should be
  283. executed next.
  284. The workflow for Thor and hThor jobs is coordinated by eclagent, while roxie includes the workflow
  285. engine in its process. The eclscheduler also uses the workflow engine to process events and mark workflow items ready
  286. for execution.
  287. eclagent, or roxie calls the createProcess() function from the workunit dll to create an instance of the generated workflow
  288. helper class, and passes it to the workflow engine. The workflow engine walks the workflow items to find any items
  289. that are ready to be executed (have the state "reqd" - i.e. required). If a required workflow item has dependencies on
  290. other child workflow items then those children are executed first. Once all dependencies have executed successfully
  291. the parent workflow item is executed. The example has the following workflow entries::
  292. <Workflow>
  293. <Item mode="normal"
  294. state="null"
  295. type="normal"
  296. wfid="1"/>
  297. <Item mode="normal"
  298. state="reqd"
  299. type="normal"
  300. wfid="2">
  301. <Dependency wfid="1"/>
  302. <Schedule/>
  303. </Item>
  304. </Workflow>
  305. Item 2 has a state of "reqd", so it should be evaluated now. Item 2 has a dependency on item 1, so that must be
  306. evaluated first. This is achieved by calling MyEclProcess::perform() on the object that was previously created
  307. from the workunit dll, passing in wfid = 1. That will execute the following code::
  308. switch (wfid) {
  309. case 1U:
  310. if (!gctx->isResult("searchname",4294967295U)) {
  311. ctx->setResultString("searchname",4294967295U,5U,"Smith");
  312. }
  313. break;
  314. break;
  315. }
  316. This checks if a value has been provided for the input parameter, and if not assigns a default value of "Smith".
  317. The function returns control to the workflow engine. With the dependencies for wfid 2 now satisfied,
  318. the generated code for that workflow id is now executed::
  319. switch (wfid) {
  320. case 2U: {
  321. ctx->executeGraph("graph1",false,0,NULL);
  322. ctx->setResultString(0,1U,5U,"Done!");
  323. }
  324. break;
  325. }
  326. Most of the work for this workflow item involves executing graph1 (by calling back into eclagent/roxie). However, the
  327. code also directly sets another result. This is fairly typical - the code inside MyEclProcess::perform often combines
  328. evaluating scalar results, executing graphs, and calling functions
  329. that cannot (currently) be called inside a graph (e.g. those involving superfile transactions).
  330. Once all of the required workflow items are executed, the workunit is marked as completed. Alternatively, if there are
  331. workflow items that are waiting to be triggered by an event, the workunit will be passed to the scheduler,
  332. which will keep monitoring for events.
  333. There are various specialised types of workflow items - e.g. sequential, wait, independent, but they all follow the same basic approach of
  334. executing dependencies and then executing that particular item.
  335. Most of the interesting work in an ECL query is done within a graph. The call ctx->executeGraph will either execute the graph locally
  336. (in the case of hthor and roxie), or add the workunit onto a queue (for Thor). Whichever happens, control will pass to that engine.
  337. Graph Execution
  338. ===============
  339. All the engines (roxie, hThor, Thor) execute graphs in a very similar way. The main differences are that hThor and
  340. Thor execute a sub graph at a time, while roxie executes a complete graph as one. Roxie is also optimized to minimize
  341. the overhead of executing a query - since the same query tends to be run multiple times. This means that roxie creates a
  342. graph of factory objects and those are then used to create the activities. The core details are the same for each of them though.
  343. Details of the graph structure
  344. ------------------------------
  345. First, a recap of the structure of the graph together with the full xml for the graph definition in our example::
  346. <Graph name="graph1" type="activities">
  347. <xgmml>
  348. <graph wfid="2">
  349. <node id="1">
  350. <att>
  351. <graph>
  352. <att name="rootGraph" value="1"/>
  353. <edge id="2_0" source="2" target="3"/>
  354. <node id="2" label="Index Read&#10;&apos;names&apos;">
  355. <att name="definition" value="workuniteg1.ecl(3,1)"/>
  356. <att name="name" value="results"/>
  357. <att name="_kind" value="77"/>
  358. <att name="ecl" value="INDEX({ string40 name, string80 address }, &apos;names&apos;, fileposition(false));&#10;FILTER(KEYED(name = STORED(&apos;searchname&apos;)));&#10;"/>
  359. <att name="recordSize" value="120"/>
  360. <att name="predictedCount" value="0..?[disk]"/>
  361. <att name="_fileName" value="names"/>
  362. </node>
  363. <node id="3" label="Output&#10;Result #1">
  364. <att name="definition" value="workuniteg1.ecl(4,1)"/>
  365. <att name="_kind" value="16"/>
  366. <att name="ecl" value="OUTPUT(..., workunit);&#10;"/>
  367. <att name="recordSize" value="120"/>
  368. </node>
  369. </graph>
  370. </att>
  371. </node>
  372. </graph>
  373. </xgmml>
  374. </Graph>
  375. Each graph (e.g. graph1) consists of 1 or more subgraphs (in the example above, node id=1). Each of those subgraphs contains 1
  376. or more activities (node id=2, node id=3).
  377. The xml for each activity might contain the following information:
  378. * A unique id (e.g. id="2").
  379. * The "kind" of the activity, e.g. <att name="_kind" value="77"/>. The value is an item from the enum ThorActivityKind in eclhelper.hpp.
  380. * The ECL that created the activity. E.g. <att name="ecl" value="...">
  381. * The identifier of the ecl definition. E.g. <att name="name" value="results"/>
  382. * Location (e.g. file, line number, column) of the original ECL. E.g. <att name="definition" value="workuniteg1.ecl(3,1)"/>
  383. * Meta information the code generator has deduced about the activity output. Examples include the
  384. record size, expected number of rows, sort order etc. E.g. <att name="recordSize" value="120"/>
  385. * Hints, which are used for fine control of options for a particular activity (e.g., the number of threads to use while sorting).
  386. * Record counts and stats once the job has executed. (These are logically associated with the activities in the graph, but stored separately.)
  387. Graphs also contain edges that can take one of 3 forms:
  388. Edges within graphs
  389. These are used to indicate how the activities are connected. The source activity is used as the input to the target activity.
  390. These edges have the following format::
  391. <edge id="<source-activity-id>_<output-count>" source="<source-activity-id>" target="<target-activity-id>"/>
  392. There is only one edge in our example workunit: <edge id="2_0" source="2" target="3"/>.
  393. Edges between graphs
  394. These are used to indicate direct dependencies between activities. For instance there will be an edge connecting the activity that
  395. writes a spill file to the activity that reads it. These edges have the following format::
  396. <edge id="<source-activity-id>_<target-activity-id>" source="<source-subgraph-id>" target="<target-subgraph-id>">
  397. <att name="_sourceActivity" value="<source-activity-id>"/>
  398. <att name="_targetActivity" value="<target-activity-id>"/>
  399. </edge>
  400. Roxie often optimizes spilled datasets and treats these edges as equivalent to the edges between activities.
  401. Other dependencies.
  402. These are similar to the edges between graphs, but they are used for values that are used within an activity. For
  403. instance one part of the graph may calculate the maximum value of a column, and another activity may filter out
  404. all records that do not match that maximum value. The format is the same as the edges between graphs except that the
  405. edge contains the following attribute::
  406. <att name="_dependsOn" value="1"/>
  407. Each activity in a graph also has a corresponding helper class instance in the generated code. (The
  408. name of the class is "cAc" followed by the activity number, and the exported factory method is "fAc"
  409. followed by the activity number.) Each helper class implements a specialised interface (derived from IHThorArg) - the
  410. particular interface is determined by the value of the "_kind" attribute for the activity.
  411. The contents of the file rtl/include/eclhelper.hpp are key to understanding how the generated code relates to the activities.
  412. Each kind of activity requires a helper class that implements a specific interface. The helpers allow the engine to
  413. tailor the generalised activity implementation to the particular instance - e.g. for a filter activity
  414. whether a row should be included or excluded. The appendix at the end of this document contains some further information
  415. about this file.
  416. The classes in the generated workunits are normally derived from base implementations of those interfaces (which are
  417. implemented in rtl/include/eclhelper_base.hpp). This reduces the size of the generated code by providing default implementations
  418. for various functions.
  419. For instance the helper for the index read (activity 2) is defined as::
  420. struct cAc2 : public CThorIndexReadArg {
  421. virtual unsigned getFormatCrc() {
  422. return 470622073U;
  423. }
  424. virtual bool getIndexLayout(size32_t & __lenResult, void * & __result) { getLayout5(__lenResult, __result, ctx); return true; }
  425. virtual IOutputMetaData * queryDiskRecordSize() { return &mx1; }
  426. virtual IOutputMetaData * queryOutputMeta() { return &mx1; }
  427. virtual void onCreate(ICodeContext * _ctx, IHThorArg *, MemoryBuffer * in) {
  428. ctx = _ctx;
  429. ctx->getResultString(v2,v1.refstr(),"searchname",4294967295U);
  430. }
  431. rtlDataAttr v1;
  432. unsigned v2;
  433. virtual const char * getFileName() {
  434. return "names";
  435. }
  436. virtual void createSegmentMonitors(IIndexReadContext *irc) {
  437. Owned<IStringSet> set3;
  438. set3.setown(createRtlStringSet(40));
  439. char v4[40];
  440. rtlStrToStr(40U,v4,v2,v1.getstr());
  441. if (rtlCompareStrStr(v2,v1.getstr(),40U,v4) == 0) {
  442. set3->addRange(v4,v4);
  443. }
  444. irc->append(createKeySegmentMonitor(false, set3.getClear(), 0, 40));
  445. irc->append(createWildKeySegmentMonitor(40, 80));
  446. }
  447. virtual size32_t transform(ARowBuilder & crSelf, const void * _left) {
  448. crSelf.getSelf();
  449. unsigned char * left = (unsigned char *)_left;
  450. memcpy(crSelf.row() + 0U,left + 0U,120U);
  451. return 120U;
  452. }
  453. };
  454. Some of the methods to highlight are:
  455. a) onCreate() - common to all activities. It is called by the engine when the helper is first created,
  456. and allows the helper to cache information that does not change - in this case the name that is
  457. being searched for.
  458. b) getFileName() - determines the name of the index being read.
  459. c) createSegmentMonitors() - defines which columns to filter, and which values to match against.
  460. d) transform() - creates the record to return from the activity. It controls which columns should be included
  461. from the index row in the output. (In this case all.)
  462. Executing the graph
  463. -------------------
  464. To execute a graph, the engine walks the activities in the graph xml and creates, in memory, a graph of implementation
  465. activities.
  466. For each activity, the name of the helper factory is calculated from the activity number (e.g. fAc2 for
  467. activity 2). That function is imported from the loaded dll, and then called to create an instance of the generated helper
  468. class - in this case cAc2.
  469. The engine then creates an instance of the class for implementing the activity, and passes the previously created helper
  470. object to the constructor. The engine uses the _kind attribute in the graph to determine which activity class should
  471. be used. E.g. In the example above activity 2 has a _kind of 77, which corresponds to TAKindexread. For an index-read
  472. activity roxie will create an instance of CRoxieServerIndexReadActivity. (The generated helper that is passed to the
  473. activity instance will implement IHThorIndexReadArg). The activity implementations may also extract other information
  474. from the xml for the activity - e.g. hints. Once all the activities are created the edge information is used to link
  475. input activities to output activities and add other dependencies.
  476. Note: Any subgraph that is marked with <att name="rootGraph" value="1"/> is a root subgraph. An activity within
  477. a subgraph that has no outputs is called a 'sink' (and an activity without any inputs is called a 'source').
  478. Executing a graph involves executing all the root subgraphs that it contains. All
  479. dependencies of the activities within the subgraph must be executed before a subgraph is executed.
  480. To execute a subgraph, the engine executes each of the sink activities
  481. on separate threads, and then waits for each of those threads to complete.
  482. Each sink activity lazily pulls input rows on demand from activities further up the graph, processes them and returns
  483. when complete.
  484. (If you examine the code you will find that this is a simplification. The implementation for processing dependencies is more
  485. fine-grained to ensure IF datasets, OUTPUT(,UPDATE) and other ECL constructs are executed correctly.)
  486. In our example the execution flows as follows:
  487. 1. There is only a single root subgraph, so the engine needs to execute that.
  488. 2. The engine will execute activity 3 - the workunit-write activity (TAKworkunitwrite).
  489. 3. The workunit-write activity will start its input.
  490. 4. The index-read activity will call the helper functions to obtain the filename, resolve the index, and create the filter.
  491. 5. The workunit-write activity requests a row from its input.
  492. 6. The index-read finds the first row, and calls the helper's transform() method to create an output row.
  493. 7. The workunit-write activity persists the row to a buffer (using the serializer provided by the IOutputMetaData interface
  494. implemented by the class mx1).
  495. 8. Back to step 5, workunit-write reading a row from its input, until end of file is returned (notified as two consecutive
  496. NULL rows).
  497. 9. Workunit-write commits the results and finishes.
  498. The execution generally switches back and forth between the code in the engines, and the members of the generated helper
  499. classes.
  500. There are some other details of query execution that are worth highlighting:
  501. Child Queries
  502. Some activities perform complicated operations on child datasets of the input rows. E.g. remove all duplicate
  503. people who are marked as living at this address. This will create a "child query" in the graph - i.e. a
  504. nested graph within a subgraph, which may be executed each time a new input row is processed by the containing
  505. activity.
  506. (The graph of activities for each child query is created at the same time as the parent activity. The activity
  507. instances are reinitialised and re-executed for each input row processed by the parent activity to minimise the
  508. create-time overhead.)
  509. Other helper functions
  510. The generated code contains other functions that are used to describe the meta information for the rows
  511. processed within the graph.
  512. E.g. the following class describes the output from the index read activity::
  513. struct mi1 : public CFixedOutputMetaData {
  514. inline mi1() : CFixedOutputMetaData(120) {}
  515. virtual const RtlTypeInfo * queryTypeInfo() const { return &ty1; }
  516. } mx1;
  517. This represents a fixed size row that occupies 120 bytes. The object returned by the queryTypeInfo() function
  518. provides information about the types of the fields::
  519. const RtlStringTypeInfo ty2(0x4,40);
  520. const RtlFieldStrInfo rf1("name",NULL,&ty2);
  521. const RtlStringTypeInfo ty3(0x4,80);
  522. const RtlFieldStrInfo rf2("address",NULL,&ty3);
  523. const RtlFieldInfo * const tl4[] = { &rf1,&rf2, 0 };
  524. const RtlRecordTypeInfo ty1(0xd,120,tl4);
  525. I.e. a string column, length 40 called "name", followed by a string column, length 80 called "address". The
  526. interface IOutputMetaData in eclhelper.hpp is key to understanding how the rows are processed.
  527. Inline dataset operations
  528. The rule mentioned at the start - that the generated code does not contain any knowledge of how to perform a
  529. particular dataset operation - does have one notable exception. Some operations on child datasets are very
  530. simple to implement, and more efficient if they are implemented using inline C++ code. (The generated code
  531. is smaller, and they avoid the overhead of setting up a child graph.) Examples include filtering and aggregating
  532. column values from a child dataset.
  533. The full code in the different engines is more complicated than the simplified process outlined above, especially
  534. when it comes to executing dependencies, but the broad outline is the same.
  535. Appendix
  536. ========
  537. More information on the work done in the code generator to create the workunit can be found in ecl/eclcc/DOCUMENTATION.rst.
  538. The C++ code can be generated as a single C++ file or multiple files. The system defaults to multiple C++ files, so that
  539. they can be compiled in parallel (and to avoid problems some compilers have with very large files). When multiple C++ files are
  540. generated the metadata classes and workflow classes are generated in the main module, and the activities are generated
  541. in the files suffixed with a number. It may be easier to understand the generated code if it is in one place. In that case,
  542. compile your query with the option -fspanMultipleCpp=0. Use -fsaveCppTempFiles to ensure the C++ files are not deleted
  543. (the C++ files will appear as helpers in the workunit details).
  544. Key types and interfaces from eclhelper.hpp
  545. -------------------------------------------
  546. IEclProcess
  547. The interface that is used by the workflow engine to execute the different workflow items in the generated code.
  548. ThorActivityKind
  549. This enumeration contains one entry for each activity supported by the engines.
  550. ICodeContext
  551. This interface is implemented by the engine, and provides a mechanism for the generated code to call back into the
  552. engine. For example resolveChildQuery() is used to obtain a reference to a child query that can then be executed later.
  553. IOutputMetaData
  554. This interface is used to describe any meta data associated with the data being processed by the queries.
  555. IHThorArg
  556. The base interface for defining information about an activity. Each activity has an associated interface that is derived
  557. from this interface. E.g. each instance of the sort activity will have a helper class implementing IHThorSortArg in
  558. the generated query. There is normally a corresponding base class for each interface in eclhelper_base.hpp that is
  559. used by the generated code e.g. CThorSortArg.
  560. ARowBuilder
  561. This abstract base class is used by the transform functions to reserve memory for the rows that are created.
  562. IEngineRowAllocator
  563. Used by the generated code to allocate rows and rowsets. Can also be used to release rows (or call the global function rtlReleaseRow()).
  564. IGlobalCodeContext
  565. Provides access to functions that cannot be called inside a graph - i.e. can only be called from the global workflow code. Most functions
  566. are related to the internal implementation of particular workflow item types (e.g. persists).
  567. Glossary
  568. --------
  569. activity
  570. An activity is the basic unit of dataset processing implemented by the engines. Each activity corresponds to a node
  571. in the thor execution graph. Instances of the activities are connected together to create the graph.
  572. dll
  573. A dynamically loaded library. These correspond to shared objects in Linux (extension '.so'), dynamic libraries
  574. in Mac OS X ('.dylib'), and dynamic link libraries in Windows ('.dll').
  575. superfile
  576. A composite file which allows a collection of files to be treated as a single compound file.
  577. ?What else should go here?
  578. Full text of the workunit XML
  579. -----------------------------
  580. The XML for a workunit can be viewed on the XML tab in eclwatch, or generated by compiling the ECL using the -wu option
  581. with eclcc. Alternatively eclcc -b -S can be used to generate the XML and the C++ at the same time (the output filenames
  582. are derived from the input name).
  583. ::
  584. <W_LOCAL buildVersion="internal_5.3.0-closedown0"
  585. cloneable="1"
  586. codeVersion="157"
  587. eclVersion="5.3.0"
  588. hash="2344844820"
  589. state="completed"
  590. xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance">
  591. <Debug>
  592. <addtimingtoworkunit>0</addtimingtoworkunit>
  593. <debugnlp>1</debugnlp>
  594. <expandpersistinputdependencies>1</expandpersistinputdependencies>
  595. <forcegenerate>1</forcegenerate>
  596. <noterecordsizeingraph>1</noterecordsizeingraph>
  597. <regressiontest>1</regressiontest>
  598. <showmetaingraph>1</showmetaingraph>
  599. <showrecordcountingraph>1</showrecordcountingraph>
  600. <spanmultiplecpp>0</spanmultiplecpp>
  601. <targetclustertype>hthor</targetclustertype>
  602. </Debug>
  603. <Graphs>
  604. <Graph name="graph1" type="activities">
  605. <xgmml>
  606. <graph wfid="2">
  607. <node id="1">
  608. <att>
  609. <graph>
  610. <att name="rootGraph" value="1"/>
  611. <edge id="2_0" source="2" target="3"/>
  612. <node id="2" label="Index Read&#10;&apos;names&apos;">
  613. <att name="definition" value="workuniteg1.ecl(3,1)"/>
  614. <att name="name" value="results"/>
  615. <att name="_kind" value="77"/>
  616. <att name="ecl" value="INDEX({ string40 name, string80 address }, &apos;names&apos;, fileposition(false));&#10;FILTER(KEYED(name = STORED(&apos;searchname&apos;)));&#10;"/>
  617. <att name="recordSize" value="120"/>
  618. <att name="predictedCount" value="0..?[disk]"/>
  619. <att name="_fileName" value="names"/>
  620. </node>
  621. <node id="3" label="Output&#10;Result #1">
  622. <att name="definition" value="workuniteg1.ecl(4,1)"/>
  623. <att name="_kind" value="16"/>
  624. <att name="ecl" value="OUTPUT(..., workunit);&#10;"/>
  625. <att name="recordSize" value="120"/>
  626. </node>
  627. </graph>
  628. </att>
  629. </node>
  630. </graph>
  631. </xgmml>
  632. </Graph>
  633. </Graphs>
  634. <Query fetchEntire="1">
  635. <Associated>
  636. <File desc="workuniteg1.ecl.cpp"
  637. filename="c:\regressout\workuniteg1.ecl.cpp"
  638. ip="10.121.159.73"
  639. type="cpp"/>
  640. </Associated>
  641. </Query>
  642. <Results>
  643. <Result isScalar="0"
  644. name="Result 1"
  645. recordSizeEntry="mf1"
  646. rowLimit="-1"
  647. sequence="0">
  648. <SchemaRaw xsi:type="SOAP-ENC:base64">
  649. name&#xe000;&#xe004;(&#xe000;&#xe000;&#xe000;&#xe001;ascii&#xe000;&#xe001;ascii&#xe000;address&#xe000;&#xe004;P&#xe000;&#xe000;&#xe000;&#xe001;ascii&#xe000;&#xe001;ascii&#xe000;&#xe000;&#xe018;%&#xe000;&#xe000;&#xe000;{ string40 name, string80 address };&#10; </SchemaRaw>
  650. </Result>
  651. <Result name="Result 2" sequence="1">
  652. <SchemaRaw xsi:type="SOAP-ENC:base64">
  653. Result_2&#xe000;&#xe004;&#241;&#255;&#255;&#255;&#xe001;ascii&#xe000;&#xe001;ascii&#xe000;&#xe000;&#xe018;&#xe000;&#xe000;&#xe000;&#xe000; </SchemaRaw>
  654. </Result>
  655. </Results>
  656. <Statistics>
  657. <Statistic c="eclcc"
  658. count="1"
  659. creator="eclcc"
  660. kind="SizePeakMemory"
  661. s="compile"
  662. scope="compile"
  663. ts="1428933081084000"
  664. unit="sz"
  665. value="27885568"/>
  666. </Statistics>
  667. <Variables>
  668. <Variable name="searchname">
  669. <SchemaRaw xsi:type="SOAP-ENC:base64">
  670. searchname&#xe000;&#xe004;&#241;&#255;&#255;&#255;&#xe001;ascii&#xe000;&#xe001;ascii&#xe000;&#xe000;&#xe018;&#xe000;&#xe000;&#xe000;&#xe000; </SchemaRaw>
  671. </Variable>
  672. </Variables>
  673. <Workflow>
  674. <Item mode="normal"
  675. state="null"
  676. type="normal"
  677. wfid="1"/>
  678. <Item mode="normal"
  679. state="reqd"
  680. type="normal"
  681. wfid="2">
  682. <Dependency wfid="1"/>
  683. <Schedule/>
  684. </Item>
  685. </Workflow>
  686. </W_LOCAL>
  687. Full contents of the generated C++ (as a single file)
  688. -----------------------------------------------------
  689. ::
  690. /* Template for generating thor/hthor/roxie output */
  691. #include "eclinclude4.hpp"
  692. #include "eclrtl.hpp"
  693. #include "rtlkey.hpp"
  694. extern RTL_API void rtlStrToStr(size32_t lenTgt,void * tgt,size32_t lenSrc,const void * src);
  695. extern RTL_API int rtlCompareStrStr(size32_t lenL,const char * l,size32_t lenR,const char * r);
  696. const RtlStringTypeInfo ty2(0x4,40);
  697. const RtlFieldStrInfo rf1("name",NULL,&ty2);
  698. const RtlStringTypeInfo ty3(0x4,80);
  699. const RtlFieldStrInfo rf2("address",NULL,&ty3);
  700. const RtlFieldInfo * const tl4[] = { &rf1,&rf2, 0 };
  701. const RtlRecordTypeInfo ty1(0xd,120,tl4);
  702. void getLayout5(size32_t & __lenResult, void * & __result, IResourceContext * ctx) {
  703. rtlStrToDataX(__lenResult,__result,87U,"\000R\000\000\000\001x\000\000\000\002\000\000\000\003\004\000\000\000name\004(\000\000\000\001ascii\000\001ascii\000\000\000\000\000\000\003\007\000\000\000address\004P\000\000\000\001ascii\000\001ascii\000\000\000\000\000\000\002\000\000\000");
  704. }
  705. struct mi1 : public CFixedOutputMetaData {
  706. inline mi1() : CFixedOutputMetaData(120) {}
  707. virtual const RtlTypeInfo * queryTypeInfo() const { return &ty1; }
  708. } mx1;
  709. extern "C" ECL_API IOutputMetaData * mf1() { mx1.Link(); return &mx1; }
  710. struct cAc2 : public CThorIndexReadArg {
  711. virtual unsigned getFormatCrc() {
  712. return 470622073U;
  713. }
  714. virtual bool getIndexLayout(size32_t & __lenResult, void * & __result) { getLayout5(__lenResult, __result, ctx); return true; }
  715. virtual IOutputMetaData * queryDiskRecordSize() { return &mx1; }
  716. virtual IOutputMetaData * queryOutputMeta() { return &mx1; }
  717. virtual void onCreate(ICodeContext * _ctx, IHThorArg *, MemoryBuffer * in) {
  718. ctx = _ctx;
  719. ctx->getResultString(v2,v1.refstr(),"searchname",4294967295U);
  720. }
  721. rtlDataAttr v1;
  722. unsigned v2;
  723. virtual const char * getFileName() {
  724. return "names";
  725. }
  726. virtual void createSegmentMonitors(IIndexReadContext *irc) {
  727. Owned<IStringSet> set3;
  728. set3.setown(createRtlStringSet(40));
  729. char v4[40];
  730. rtlStrToStr(40U,v4,v2,v1.getstr());
  731. if (rtlCompareStrStr(v2,v1.getstr(),40U,v4) == 0) {
  732. set3->addRange(v4,v4);
  733. }
  734. irc->append(createKeySegmentMonitor(false, set3.getClear(), 0, 40));
  735. irc->append(createWildKeySegmentMonitor(40, 80));
  736. }
  737. virtual size32_t transform(ARowBuilder & crSelf, const void * _left) {
  738. crSelf.getSelf();
  739. unsigned char * left = (unsigned char *)_left;
  740. memcpy(crSelf.row() + 0U,left + 0U,120U);
  741. return 120U;
  742. }
  743. };
  744. extern "C" ECL_API IHThorArg * fAc2() { return new cAc2; }
  745. struct cAc3 : public CThorWorkUnitWriteArg {
  746. virtual int getSequence() { return 0; }
  747. virtual IOutputMetaData * queryOutputMeta() { return &mx1; }
  748. virtual void serializeXml(const byte * self, IXmlWriter & out) {
  749. mx1.toXML(self, out);
  750. }
  751. };
  752. extern "C" ECL_API IHThorArg * fAc3() { return new cAc3; }
  753. struct MyEclProcess : public EclProcess {
  754. virtual unsigned getActivityVersion() const { return ACTIVITY_INTERFACE_VERSION; }
  755. virtual int perform(IGlobalCodeContext * gctx, unsigned wfid) {
  756. ICodeContext * ctx;
  757. ctx = gctx->queryCodeContext();
  758. switch (wfid) {
  759. case 1U:
  760. if (!gctx->isResult("searchname",4294967295U)) {
  761. ctx->setResultString("searchname",4294967295U,5U,"Smith");
  762. }
  763. break;
  764. case 2U: {
  765. ctx->executeGraph("graph1",false,0,NULL);
  766. ctx->setResultString(0,1U,5U,"Done!");
  767. }
  768. break;
  769. }
  770. return 2U;
  771. }
  772. };
  773. extern "C" ECL_API IEclProcess* createProcess()
  774. {
  775. return new MyEclProcess;
  776. }