
Merge branch 'candidate-3.6.x' into candidate-3.8.x

Recent changes on 3.6.x branch (H2H-related) merged to 3.8

Signed-off-by: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 13 years ago
parent
commit
719005e535

+ 25 - 16
docs/HDFS_Stream/HDFS_Mods/HDFS_Install.xml

@@ -31,14 +31,21 @@
 
         <programlisting>sudo rpm -Uvh &lt;rpm file name&gt;</programlisting>
 
-        <blockquote>
-          <para><emphasis role="bold">NOTE :</emphasis> For ANY version of
-          SuSe you must set a password for the hpcc user on all nodes. One way
-          to do do this is to issue the following command:</para>
+        <variablelist>
+          <varlistentry>
+            <term>Note:</term>
 
-          <para><programlisting>sudo passwd hpcc</programlisting>Be sure to
-          set the password on ALL nodes.</para>
-        </blockquote>
+            <listitem>
+              <para>For ANY version of SuSe you must set a password for the
+              <emphasis role="bluebold">hpcc</emphasis> user on all nodes. One
+              way to do this is to issue the following command:</para>
+
+              <programlisting>sudo passwd hpcc</programlisting>
+
+              <para>Be sure to set the password on ALL nodes.</para>
+            </listitem>
+          </varlistentry>
+        </variablelist>
 
         <para><emphasis role="bold">Ubuntu/Debian </emphasis></para>
 
@@ -54,7 +61,7 @@
     <title>Editing and distributing the Configuration file</title>
 
     <para>After you install the HDFS to HPCC Connector package, you must edit
-    the configuration file and push it out to all nodes. </para>
+    the configuration file and push it out to all nodes.</para>
 
     <orderedlist>
       <listitem>
@@ -63,7 +70,7 @@
 
       <listitem>
         <para>Edit the configuration file <emphasis
-        role="bluebold">/opt/HPCCSystems/etc/HPCCSystems/hdfsstream.conf</emphasis>.</para>
+        role="bluebold">/opt/HPCCSystems/etc/HPCCSystems/hdfsconnector.conf</emphasis>.</para>
 
         <para>The configuration file contains one line:</para>
 
@@ -78,18 +85,18 @@
         push.sh script:</para>
 
         <para><programlisting>sudo -u hpcc /opt/HPCCSystems/sbin/hpcc-push.sh /
-             /opt/HPCCSystems/etc/HPCCSystems/hdfsstream.conf /
-             /opt/HPCCSystems/etc/HPCCSystems/hdfsstream.conf</programlisting></para>
+             /opt/HPCCSystems/etc/HPCCSystems/hdfsconnector.conf /
+             /opt/HPCCSystems/etc/HPCCSystems/hdfsconnector.conf</programlisting></para>
       </listitem>
     </orderedlist>
 
     <para></para>
   </sect2>
 
-  <sect2>
+  <sect2 role="brk">
     <title>Installing the ECL library to your ECL IDE source folder</title>
 
-    <para>The HDSF-to-HPCC Connector library is a single ECL file containing
+    <para>The HDFS to HPCC Connector library is a single ECL file containing
     three MACROs. These steps explain how to install to your ECL source
     repository.</para>
 
@@ -100,8 +107,8 @@
 
         <listitem>
           <para>Extract the contents of the zip file to the ECL IDE source
-          folder. Make sure to enable the option to keep the folder structure
-          within Zip file.</para>
+          folder. Make sure to select the option to use the folder names from
+          the Zip file.</para>
 
           <para>The ECL Source folder is typically located at <emphasis
           role="bluebold">C:\Users\Public\Documents\HPCC Systems\ECL\My
@@ -112,7 +119,9 @@
           Compiler &gt;&gt; ECL Folders</emphasis>.</para>
 
           <para>When you are finished, the library will be in a repository
-          folder named <emphasis role="bluebold">DataStream</emphasis>.</para>
+          folder named <emphasis role="bluebold">DataConnectors</emphasis>. It
+          will contain one file named
+          <emphasis>HDFSConnector.ecl</emphasis>.</para>
         </listitem>
       </orderedlist></para>
   </sect2>

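A quick way to exercise the install steps documented above is the following shell sketch (the one-line hpcc-push.sh form is an assumption; the XML above splits the source and destination paths across lines):

    # SuSE only: set a password for the hpcc user on every node
    sudo passwd hpcc

    # After editing the configuration file, push it out to all nodes
    # (assumed single-line form of the command shown in the documentation)
    sudo -u hpcc /opt/HPCCSystems/sbin/hpcc-push.sh \
        /opt/HPCCSystems/etc/HPCCSystems/hdfsconnector.conf \
        /opt/HPCCSystems/etc/HPCCSystems/hdfsconnector.conf
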
+ 93 - 0
docs/HDFSConnector/HDFS_Mods/HDFS_Intro.xml

@@ -0,0 +1,93 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE sect1 PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
+"http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
+<sect1 id="IntroHDFS" role="nobrk">
+  <title>Introduction</title>
+
+  <para>The HDFS to HPCC Connector provides a means to import data from
+  Hadoop's HDFS into an HPCC Systems Thor platform. It also supports exporting
+  the data back to HDFS or exporting and merging it. This allows you to use an
+  HPCC cluster in conjunction with your Hadoop-based cluster.</para>
+
+  <para>The H2H Connector is an add-on to an HPCC Cluster and consists of
+  server-side components and ECL Macros that invoke them.</para>
+
+  <para><itemizedlist>
+      <listitem>
+        <para><emphasis role="bold">Server-side
+        components:</emphasis><itemizedlist>
+            <listitem>
+              <para>The executable ( /opt/HPCCSystems/bin/hdfsconnector
+              )</para>
+            </listitem>
+
+            <listitem>
+              <para>The shell script (/opt/HPCCSystems/bin/hdfspipe)</para>
+            </listitem>
+
+            <listitem>
+              <para>The configuration file
+              (/opt/HPCCSystems/etc/HPCCSystems/hdfsconnector.conf)</para>
+
+              <para>The configuration file contains the location where Hadoop
+              is installed, as shown in the example below: </para>
+
+              <programlisting>HADOOP_LOCATION=/usr/local/hadoop</programlisting>
+
+              <para>This allows access to the libhdfs (API) library.</para>
+
+              <variablelist>
+                <varlistentry>
+                  <term>Note:</term>
+
+                  <listitem>
+                    <para>The HDFS Connector writes log files to a folder
+                    named <emphasis
+                    role="bluebold">mydataconnectors</emphasis> in the
+                    HPCC log directory (the HPCC log location can be set using
+                    Configuration Manager). </para>
+
+                    <para>The default location is:<programlisting>/var/log/HPCCSystems/mydataconnectors/</programlisting></para>
+
+                    <para>The log files are written using the following
+                    pattern:<programlisting>HDFSCONNECTOR.&lt;nodeid&gt;.&lt;PID&gt;.log</programlisting></para>
+                  </listitem>
+                </varlistentry>
+              </variablelist>
+            </listitem>
+          </itemizedlist></para>
+      </listitem>
+
+      <listitem>
+        <para><emphasis role="bold">ECL Macros
+        (HDFSConnector.ecl)</emphasis></para>
+
+        <itemizedlist>
+          <listitem>
+            <para>HDFSConnector.PipeIn</para>
+
+            <para>Imports data from Hadoop's file system (HDFS) to a Thor
+            Cluster.</para>
+          </listitem>
+
+          <listitem>
+            <para>HDFSConnector.PipeOut</para>
+
+            <para>Exports data from a Thor Cluster to Hadoop's file system
+            (HDFS).</para>
+          </listitem>
+
+          <listitem>
+            <para>HDFSConnector.PipeOutAndMerge</para>
+
+            <para>Exports data from a Thor Cluster to Hadoop's file system
+            (HDFS) and merges the data.</para>
+          </listitem>
+        </itemizedlist>
+      </listitem>
+
+      <listitem>
+        <para>The HDFS to HPCC Connector User's Guide</para>
+      </listitem>
+    </itemizedlist></para>
+</sect1>

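The new Introduction above documents where the connector writes its logs; a hedged shell sketch for checking them on a node (the node id and PID in each file name vary per run):

    # Default log location named in the text above
    ls /var/log/HPCCSystems/mydataconnectors/
    # Files follow the pattern HDFSCONNECTOR.<nodeid>.<PID>.log; tail the newest one
    tail -n 50 "$(ls -t /var/log/HPCCSystems/mydataconnectors/HDFSCONNECTOR.*.log | head -1)"
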
+ 19 - 18
docs/HDFS_Stream/HDFS_Mods/HDFS_PipeIn.xml

@@ -2,10 +2,10 @@
 <!DOCTYPE sect1 PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
 "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
 <sect1 id="PIPEIN" role="nobrk">
-  <title>HDFSPipe.PipeIn</title>
+  <title>HDFSConnector.PipeIn</title>
 
-  <para><emphasis role="bold">HDFSPipe.PipeIn </emphasis><emphasis>( ECL_RS,
-  HadoopFileName, Layout, HadoopFileFormat, HDFSHost, HDFSPort
+  <para><emphasis role="bold">HDFSConnector.PipeIn </emphasis><emphasis>(
+  ECL_RS, HadoopFileName, Layout, HadoopFileFormat, HDFSHost, HDFSPort
   )</emphasis></para>
 
   <para><informaltable colsep="0" frame="none" rowsep="0">
@@ -49,20 +49,21 @@
           <row>
             <entry><emphasis>HDFSPort</emphasis></entry>
 
-            <entry>The Hadoop DFS port number.</entry>
+            <entry>The Hadoop NameNode port number.</entry>
           </row>
         </tbody>
       </tgroup>
     </informaltable></para>
 
-  <para>The <emphasis role="bold">HDFSPipe.PipeIn </emphasis>macro is called
-  to pipe in data from the Hadoop file system (HDFS) to a Thor Cluster.</para>
+  <para>The <emphasis role="bold">HDFSConnector.PipeIn </emphasis>macro is
+  called to pipe in data from the Hadoop file system (HDFS) to a Thor
+  Cluster.</para>
 
   <para>Example:</para>
 
   <programlisting>#OPTION('pickBestEngine', 0);
 IMPORT std;
-IMPORT DataStream;
+IMPORT DataConnectors;
 Layout_Flat := RECORD
   STRING10  fname;
   STRING10  lname;
@@ -75,19 +76,19 @@ Layout_Flat := RECORD
   UNSIGNED1 one;
   UNSIGNED8 id;
 END;
-DataStream.HDFSPipe.PipeIn(certrecords, 
-                           '/user/hadoop/test/cert1', 
-                           Layout_Flat, 
-                           FLAT, 
+DataConnectors.HDFSConnector.PipeIn(MyDataFile, 
+                           '/user/hadoop/test/MyData1', 
+                           Layout_Flat, FLAT, 
                            '192.168.56.120', 
-                           54310)
+                           54310);
+OUTPUT(MyDataFile);
 </programlisting>
 
   <?hard-pagebreak ?>
 
   <programlisting>#OPTION('pickBestEngine', 0);
 IMPORT std;
-IMPORT DataStream;
+IMPORT DataConnectors;
 Layout_CSV := RECORD
   STRING10 fname;
   STRING10 lname;
@@ -100,12 +101,12 @@ Layout_CSV := RECORD
   STRING3  one;
   STRING20 id;
 END;
-DataStream.HDFSPipe.PipeIn(certrecords, 
-                           '/user/Administrator/test/cert1', 
-                           Layout_CSV, 
-                           CSV(SEPARATOR('|')), 
+DataConnectors.HDFSConnector.PipeIn(MyDataFile, 
+                           '/user/Administrator/test/MyData1', 
+                           Layout_CSV, CSV(SEPARATOR('|')), 
                            '192.168.56.120', 
-                           54310)
+                           54310);
+OUTPUT(MyDataFile);
 </programlisting>
 
   <para></para>

+ 26 - 25
docs/HDFS_Stream/HDFS_Mods/HDFS_PipeOut.xml

@@ -2,11 +2,11 @@
 <!DOCTYPE sect1 PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
 "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
 <sect1 id="PIPEOUT">
-  <title>HDFSPipe.PipeOut</title>
+  <title>HDFSConnector.PipeOut</title>
 
-  <para><emphasis role="bold">HDFSPipe.PipeOut </emphasis><emphasis> (ECL_RS,
-  HadoopFileName, Layout, HadoopFileFormat, HDFSHost, HDFSPort, HDFSUser
-  )</emphasis></para>
+  <para><emphasis role="bold">HDFSConnector.PipeOut </emphasis><emphasis>
+  (ECL_RS, HadoopFileName, Layout, HadoopFileFormat, HDFSHost, HDFSPort,
+  HDFSUser )</emphasis></para>
 
   <para><informaltable colsep="0" frame="none" rowsep="0">
       <tgroup cols="2">
@@ -18,7 +18,7 @@
           <row>
             <entry><emphasis>ECL_RS</emphasis></entry>
 
-            <entry>The ECL recordset to stream out.</entry>
+            <entry>The ECL recordset to export.</entry>
           </row>
 
           <row>
@@ -48,7 +48,7 @@
           <row>
             <entry><emphasis>HDFSPort</emphasis></entry>
 
-            <entry>The Hadoop DFS port number.</entry>
+            <entry>The Hadoop NameNode port number.</entry>
           </row>
 
           <row>
@@ -62,17 +62,18 @@
       </tgroup>
     </informaltable></para>
 
-  <para>The <emphasis role="bold">HDFSPipe.Pipeout </emphasis>macro writes the
-  given <emphasis>ECL_RS</emphasis> recordset to the target HDFS system in
-  file parts -- one file part for each HPCC Thor node. You can then use other
-  means to merge the file parts or you can use <emphasis
-  role="bold">HDFSPipe.PipeOutAndMerge</emphasis> to do both tasks.</para>
+  <para>The <emphasis role="bold">HDFSConnector.Pipeout </emphasis>macro
+  writes the given <emphasis>ECL_RS</emphasis> recordset to the target HDFS
+  system in file parts -- one file part for each HPCC Thor node. You can then
+  use other means to merge the file parts or you can use <emphasis
+  role="bold">HDFSConnector.PipeOutAndMerge</emphasis> to do both
+  tasks.</para>
 
   <para>Examples:</para>
 
   <programlisting>#OPTION('pickBestEngine', 0);  
 IMPORT std;
-IMPORT DataStream;
+IMPORT DataConnectors;
 Layout_Flat :=RECORD
   STRING10 fname;
   STRING10 lname;
@@ -85,21 +86,21 @@ Layout_Flat :=RECORD
   UNSIGNED1 one;
   UNSIGNED8 id;
 END;
-hpcccertrecords := DATASET('~certification::full_test_distributed',Layout_Flat, FLAT);
+MyDataFile := DATASET('~certification::full_test_distributed',Layout_Flat, FLAT);
 //piping out hpcccertrecords to a flat file in HDFS called /user/hadoop/test/cert1,
-DataStream.HDFSPipe.PipeOut(hpcccertrecords, 
-                            '/user/hadoop/test/cert1',
-                            Layout_Flat, 
-                            FLAT, 
+DataConnectors.HDFSConnector.PipeOut(MyDataFile, 
+                            '/user/hadoop/test/MyData1',
+                            Layout_Flat, FLAT, 
                             '192.168.56.120', 
                             54310 
-                            'hadoop' )</programlisting>
+                            'hadoopusername' );
+</programlisting>
 
   <?hard-pagebreak ?>
 
   <programlisting>#OPTION('pickBestEngine', 0);  
 IMPORT std;
-IMPORT DataStream;
+IMPORT DataConnectors;
 Layout_CSV := RECORD
   STRING10 fname;
   STRING10 lname;
@@ -112,15 +113,15 @@ Layout_CSV := RECORD
   STRING3  one;
   STRING20 id;
 END;
-hpcccertrecords := DATASET('~certification::full_test_distributed',Layout_CSV, CSV);
+MyDataFile := DATASET('~certification::full_test_distributed',Layout_CSV, CSV);
 //piping out hpcccertrecords to a CSV file in HDFS called /user/hadoop/test/cert1,
-DataStream.HDFSPipe.PipeOut(hpcccertrecords, 
-                            '/user/hadoop/test/cert1', 
-                            Layout_CSV, 
-                            CSV, 
+DataConnectors.HDFSConnector.PipeOut(MyDataFile, 
+                            '/user/hadoop/test/MyData1', 
+                            Layout_CSV, CSV, 
                             '192.168.56.120', 
                             54310 
-                            'hadoop' )</programlisting>
+                            'hadoopusername' );
+</programlisting>
 
   <para></para>
 </sect1>

+ 23 - 24
docs/HDFS_Stream/HDFS_Mods/HDFS_PipeOutandMerge.xml

@@ -2,11 +2,12 @@
 <!DOCTYPE sect1 PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
 "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
 <sect1 id="PIPEOUTANDMERGE">
-  <title>HDFSPipe.PipeOutAndMerge</title>
+  <title>HDFSConnector.PipeOutAndMerge</title>
 
-  <para><emphasis role="bold">HDFSPipe.PipeOutAndMerge</emphasis><emphasis>
-  (ECL_RS, HadoopFileName, Layout, HadoopFileFormat, HDFSHost, HDFSPort,
-  HDFSUser )</emphasis></para>
+  <para><emphasis
+  role="bold">HDFSConnector.PipeOutAndMerge</emphasis><emphasis> (ECL_RS,
+  HadoopFileName, Layout, HadoopFileFormat, HDFSHost, HDFSPort, HDFSUser
+  )</emphasis></para>
 
   <para><informaltable colsep="0" frame="none" rowsep="0">
       <tgroup cols="2">
@@ -18,7 +19,7 @@
           <row>
             <entry><emphasis>ECL_RS</emphasis></entry>
 
-            <entry>The ECL recordset to stream out.</entry>
+            <entry>The ECL recordset to export.</entry>
           </row>
 
           <row>
@@ -48,7 +49,7 @@
           <row>
             <entry><emphasis>HDFSPort</emphasis></entry>
 
-            <entry>The Hadoop DFS port number.</entry>
+            <entry>The Hadoop NameNode port number.</entry>
           </row>
 
           <row>
@@ -62,16 +63,16 @@
       </tgroup>
     </informaltable></para>
 
-  <para>The <emphasis role="bold">HDFSPipe.PipeOutAndMerge </emphasis>macro
-  writes the given <emphasis>ECL_RS</emphasis> recordset to the target HDFS
-  system in file parts and merges them together to form a single target file
-  on the HDFS system.</para>
+  <para>The <emphasis role="bold">HDFSConnector.PipeOutAndMerge
+  </emphasis>macro writes the given <emphasis>ECL_RS</emphasis> recordset to
+  the target HDFS system in file parts and merges them together to form a
+  single target file on the HDFS system.</para>
 
   <para>Example:</para>
 
   <programlisting>#OPTION('pickBestEngine', 0);  
 IMPORT std;
-IMPORT DataStream;
+IMPORT DataConnectors;
 Layout_Flat :=RECORD
   STRING10  fname;
   STRING10  lname;
@@ -84,21 +85,20 @@ Layout_Flat :=RECORD
   UNSIGNED1 one;
   UNSIGNED8 id;
 END;
-hpcccertrecords := DATASET('~certification::full_test_distributed',Layout_Flat, FLAT);
-DataStream.HDFSPipe.PipeOutAndMerge(hpcccertrecords, 
-                                    '/user/hadoop/test/cert1', 
-                                    Layout_Flat, 
-                                    FLAT, 
+MyDataFile := DATASET('~certification::full_test_distributed',Layout_Flat, FLAT);
+DataConnectors.HDFSConnector.PipeOutAndMerge(MyDataFile, 
+                                    '/user/hadoop/test/MyData1', 
+                                    Layout_Flat, FLAT, 
                                     '192.168.56.120', 
                                     54310, 
-                                    'hadoop' )
+                                    'hadoopusername' );
 </programlisting>
 
   <?hard-pagebreak ?>
 
   <programlisting>#OPTION('pickBestEngine', 0);  
 IMPORT std;
-IMPORT DataStream;
+IMPORT DataConnectors;
 Layout_CSV := RECORD
   STRING10 fname;
   STRING10 lname;
@@ -111,14 +111,13 @@ Layout_CSV := RECORD
   STRING3  one;
   STRING20 id;
 END;
-hpcccertrecords := DATASET('~certification::full_test_distributed',Layout_CSV, CSV);
-DataStream.HDFSPipe.PipeOutAndMerge(hpcccertrecords, 
-                                    '/user/hadoop/test/cert1', 
-                                    Layout_CSV, 
-                                    CSV, 
+MyDataFile := DATASET('~certification::full_test_distributed',Layout_CSV, CSV);
+DataConnectors.HDFSConnector.PipeOutAndMerge(MyDataFile, 
+                                    '/user/hadoop/test/MyData1', 
+                                    Layout_CSV, CSV, 
                                     '192.168.56.120', 
                                     54310, 
-                                    'hadoop' )
+                                    'hadoopusername' );
 </programlisting>
 
   <para></para>

+ 5 - 5
docs/HDFS_Stream/HDFS_to_HPCC_ConnectorIncluder.xml

@@ -53,11 +53,11 @@
   <chapter>
     <title>HDFS to HPCC Connector</title>
 
-    <xi:include href="HDFS_Stream/HDFS_Mods/HDFS_Intro.xml"
+    <xi:include href="HDFSConnector/HDFS_Mods/HDFS_Intro.xml"
                 xpointer="element(/1)"
                 xmlns:xi="http://www.w3.org/2001/XInclude" />
     
-    <xi:include href="HDFS_Stream/HDFS_Mods/HDFS_Install.xml"
+    <xi:include href="HDFSConnector/HDFS_Mods/HDFS_Install.xml"
                 xpointer="element(/1)"
                 xmlns:xi="http://www.w3.org/2001/XInclude" />                
   </chapter>
@@ -65,15 +65,15 @@
   <chapter>
     <title>ECL Macros</title>
 
-    <xi:include href="HDFS_Stream/HDFS_Mods/HDFS_PipeIn.xml"
+    <xi:include href="HDFSConnector/HDFS_Mods/HDFS_PipeIn.xml"
                 xpointer="element(/1)"
                 xmlns:xi="http://www.w3.org/2001/XInclude" />
 
-    <xi:include href="HDFS_Stream/HDFS_Mods/HDFS_PipeOut.xml"
+    <xi:include href="HDFSConnector/HDFS_Mods/HDFS_PipeOut.xml"
                 xpointer="element(/1)"
                 xmlns:xi="http://www.w3.org/2001/XInclude" />
 
-    <xi:include href="HDFS_Stream/HDFS_Mods/HDFS_PipeOutandMerge.xml"
+    <xi:include href="HDFSConnector/HDFS_Mods/HDFS_PipeOutandMerge.xml"
                 xpointer="element(/1)"
                 xmlns:xi="http://www.w3.org/2001/XInclude" />
   </chapter>

+ 0 - 74
docs/HDFS_Stream/HDFS_Mods/HDFS_Intro.xml

@@ -1,74 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE sect1 PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
-"http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
-<sect1 id="IntroHDFS" role="nobrk">
-  <title>Introduction</title>
-
-  <para>The HDFS to HPCC Connector provides a means to import data from a
-  Hadoop-based HDFS into an HPCC Systems Thor platform. It also supports
-  exporting the data back to the HDFS or exporting and merging it.</para>
-
-  <para>This allows you to use an HPCC cluster in conjunction with your Hadoop
-  based cluster.</para>
-
-  <para>The H2H Connector is an add-on to an HPCC Cluster and consists
-  of:</para>
-
-  <para><itemizedlist>
-      <listitem>
-        <para>The Server-side components:<itemizedlist>
-            <listitem>
-              <para>The executable ( /opt/HPCCSystems/bin/hdfsstream )</para>
-            </listitem>
-
-            <listitem>
-              <para>The shell script (/opt/HPCCSystems/bin/hdfspipe)</para>
-            </listitem>
-
-            <listitem>
-              <para>The configuration file
-              (/opt/HPCCSystems/etc/HPCCSystems/hdfsstream.conf)</para>
-
-              <para>The configuration file contains one line:</para>
-
-              <programlisting>HADOOP_LOCATION=/usr/local/hadoop</programlisting>
-
-              <para>where the value is set to the location where Hadoop is
-              installed. This allows access to the libhdfs (API)
-              library.</para>
-            </listitem>
-          </itemizedlist></para>
-      </listitem>
-
-      <listitem>
-        <para>The ECL Macros (HDFSPipe.ecl)</para>
-
-        <itemizedlist>
-          <listitem>
-            <para>HDFSPipe.PipeIn</para>
-
-            <para>Imports data from the Hadoop file system (HDFS) to a Thor
-            Cluster.</para>
-          </listitem>
-
-          <listitem>
-            <para>HDFSPipe.PipeOut</para>
-
-            <para>Exports data from a Thor Cluster to a Hadoop file system
-            (HDFS).</para>
-          </listitem>
-
-          <listitem>
-            <para>HDFSPipe.PipeOutAndMerge</para>
-
-            <para>Exports data from a Thor Cluster to a Hadoop file system
-            (HDFS) and merges the data.</para>
-          </listitem>
-        </itemizedlist>
-      </listitem>
-
-      <listitem>
-        <para>The HDFS to HPCC Connector User's Guide</para>
-      </listitem>
-    </itemizedlist></para>
-</sect1>

+ 2 - 2
plugins/CMakeLists.txt

@@ -24,6 +24,6 @@ add_subdirectory (parselib)
 add_subdirectory (stringlib)
 add_subdirectory (unicodelib)
 add_subdirectory (workunitservices)
-if (USE_HDFSSTREAM)
-  add_subdirectory (datastream)
+if (USE_HDFSCONNECTOR)
+  add_subdirectory (dataconnectors)
 endif()

+ 5 - 5
plugins/datastream/CMakeLists.txt

@@ -1,8 +1,8 @@
-project (hpccsystems-datastream)
+project (hpccsystems-dataconnector)
 cmake_minimum_required (VERSION 2.6)
 
-set ( HPCC_DATASTREAM_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
-set ( HPCC_SOURCE_DIR ${HPCC_DATASTREAM_SOURCE_DIR}/../../)
+set ( HPCC_DATACONNECTOR_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+set ( HPCC_SOURCE_DIR ${HPCC_DATACONNECTOR_SOURCE_DIR}/../../)
 include(${HPCC_SOURCE_DIR}/version.cmake)
 
 set ( CMAKE_MODULE_PATH "${HPCC_SOURCE_DIR}/cmake_modules")
@@ -55,7 +55,7 @@ if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
     set(CPACK_STRIP_FILES TRUE)
 endif()
 
-set ( CPACK_INSTALL_CMAKE_PROJECTS "${CMAKE_CURRENT_BINARY_DIR};hdfsstream;ALL;/")
+set ( CPACK_INSTALL_CMAKE_PROJECTS "${CMAKE_CURRENT_BINARY_DIR};hdfsconnector;ALL;/")
 
 if ( CMAKE_SYSTEM MATCHES Linux )
     EXECUTE_PROCESS (
@@ -137,6 +137,6 @@ else()
     message("WARNING: CMAKE 2.8.1 or later required to create RPMs from this project")
 endif()
 
-add_subdirectory (hdfsstream)
+add_subdirectory (hdfsconnector)
 
 INCLUDE(CPack)

+ 12 - 10
plugins/datastream/hdfsstream/CMakeLists.txt

@@ -1,7 +1,7 @@
-project(hdfsstream)
+project(hdfsconnector)
 
-option(USE_HDFSSTREAM "Configure use of hdstream plugin" OFF)
-if ( USE_HDFSSTREAM )
+option(USE_HDFSCONNECTOR "Configure use of hdfs data connector" OFF)
+if ( USE_HDFSCONNECTOR )
 add_subdirectory (ecl)
 	option(HADOOP_PATH "Set the Hadoop path.")
 	if( NOT HADOOP_PATH )
@@ -12,15 +12,17 @@ add_subdirectory (ecl)
 	#generate config for script.
 	#add script processor for vars.
 
-	configure_file("hdfsstream.conf.in" "hdfsstream.conf")
+	set(HPCC_ETC_DIR "${CMAKE_INSTALL_PREFIX}/${OSSDIR}/etc")
+	set(HPCC_CONF_DIR "${CMAKE_INSTALL_PREFIX}/${OSSDIR}${CONFIG_DIR}")
+	set(HDFSCONN_CONF_FILE "hdfsconnector.conf")
 
-	set(HDFSCONFIG "${CMAKE_INSTALL_PREFIX}/${OSSDIR}${CONFIG_DIR}")
+	configure_file("hdfsconnector.conf.in" "hdfsconnector.conf")
 	configure_file("hdfspipe.in" "hdfspipe" @ONLY )
 
 	find_package(JNI REQUIRED)
 	find_package(LIBHDFS REQUIRED)
 
-	set( SRC hdfsstream.cpp )
+	set( SRC hdfsconnector.cpp )
 
 	include_directories (
 					${CMAKE_BINARY_DIR}
@@ -30,13 +32,13 @@ add_subdirectory (ecl)
 					${JAVA_INCLUDE_PATH2}
 					${LIBHDFS_INCLUDE_DIR}	)
 
-	add_executable( hdfsstream ${SRC} )
+	add_executable( hdfsconnector ${SRC} )
 
 	set ( INSTALLDIR "${OSSDIR}/bin")
-	Install ( TARGETS hdfsstream DESTINATION ${INSTALLDIR} COMPONENT Runtime)
+	Install ( TARGETS hdfsconnector DESTINATION ${INSTALLDIR} COMPONENT Runtime)
 	Install ( PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/hdfspipe DESTINATION ${INSTALLDIR} COMPONENT Runtime )
-	Install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/hdfsstream.conf DESTINATION ${HDFSCONFIG} COMPONENT Runtime )
-	target_link_libraries ( hdfsstream
+	Install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/hdfsconnector.conf DESTINATION ${HPCC_CONF_DIR} COMPONENT Runtime )
+	target_link_libraries ( hdfsconnector
 					${JAVA_JVM_LIBRARY}
 					${LIBHDFS_LIBRARIES})
 endif()

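For anyone building the renamed plugin, a minimal configure-and-build sketch using the options this CMakeLists defines (the checkout path and Hadoop location are placeholders):

    # USE_HDFSCONNECTOR and HADOOP_PATH are the options declared above;
    # ~/HPCC-Platform and /usr/local/hadoop are illustrative paths only
    cmake ~/HPCC-Platform -DUSE_HDFSCONNECTOR=ON -DHADOOP_PATH=/usr/local/hadoop
    make hdfsconnector
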
+ 1 - 0
plugins/dataconnectors/hdfsconnector/ecl/CMakeLists.txt

@@ -0,0 +1 @@
+Install ( FILES HDFSConnector.ecl DESTINATION "${OSSDIR}/share/DataConnectors" COMPONENT Runtime )

+ 14 - 14
plugins/datastream/hdfsstream/ecl/HDFSPipe.ecl

@@ -1,4 +1,4 @@
-/* HDFSPipe
+/* HDFSConnector
 Pipe data to and from Hadoop
 
 It is necessary to add this option to your workunit:
@@ -19,12 +19,12 @@ enable it add this to hdfs-site.xml:
 
 import std;
 
-EXPORT HDFSPipe := MODULE
+EXPORT HDFSConnector := MODULE
 
     /*
-   * HDFSPipe.PipeIn - this macro to be called by the user to pipe in data from the Hadoop file system (HDFS).
+   * HDFSConnector.PipeIn - this macro to be called by the user to pipe in data from the Hadoop file system (HDFS).
      *
-     * @param ECL_RS            The ECL recordset to stream out.
+     * @param ECL_RS            The ECL recordset to pipe into.
      * @param HadoopFileName    The fully qualified target HDFS file name.
      * @param Layout            The structure which describes the ECL_RS recordset.
      * @param HadoopFileFormat  The Hadoop data file format : FLAT | CSV.
@@ -84,7 +84,7 @@ EXPORT HDFSPipe := MODULE
 				ECL_RS:= PIPE('hdfspipe -si '
 				+ ' -nodeid ' + STD.system.Thorlib.node()
 				+ ' -clustercount ' + STD.system.Thorlib.nodes()
-				+ ' -reclen ' + sizeof(Layout)
+				+ ' -maxlen ' + sizeof(Layout, MAX)
 				+ ' -filename ' + HadoopFileName
 				+ ' -format '	+  %formatstr%[1..3]
 				+ ' -terminator ' + %termcont2%
@@ -95,7 +95,7 @@ EXPORT HDFSPipe := MODULE
 				ECL_RS:= PIPE('hdfspipe -si '
 				+ ' -nodeid ' + STD.system.Thorlib.node()
 				+ ' -clustercount ' + STD.system.Thorlib.nodes()
-				+ ' -reclen ' + sizeof(Layout)
+				+ ' -maxlen ' + sizeof(Layout, MAX)
 				+ ' -filename ' + HadoopFileName
 				+ ' -format '	+  %formatstr%[1..3]
 				+ ' -host ' + HDFSHost	+ ' -port ' + HDSFPort,
@@ -114,10 +114,10 @@ EXPORT HDFSPipe := MODULE
 	ENDMACRO;
 
     /*
-    HadoopPipe.PipeOut - writes the given recordset 'ECL_RS' to the target HDFS system in
+    HDFSConnector.PipeOut - writes the given recordset 'ECL_RS' to the target HDFS system in
                                                 file parts. One file part for each HPCC node.
 
-    ECL_RS              - The ECL recordset to stream out.
+    ECL_RS              - The ECL recordset to pipe out.
     HadoopFileName      - The fully qualified target HDFS file name.
     Layout              - The structure which describes the ECL_RS recordset.
     HadoopFileFormat    - The Hadoop data file format : FLAT | CSV
@@ -128,8 +128,8 @@ EXPORT HDFSPipe := MODULE
 
     Example:
 
-    HadoopPipe.PipeOut(sue, '/user/hadoop/HDFSAccounts', Layout_CSV_Accounts, CSV, '192.168.56.102', '54310', 'hadoop');
-    HadoopPipe.PipeOut(sue, '/user/hadoop/HDFSPersons', Layout_Flat_Persons, FLAT, '192.168.56.102', '54310', 'hadoop');
+    HDFSConnector.PipeOut(sue, '/user/hadoop/HDFSAccounts', Layout_CSV_Accounts, CSV, '192.168.56.102', '54310', 'hadoop');
+    HDFSConnector.PipeOut(sue, '/user/hadoop/HDFSPersons', Layout_Flat_Persons, FLAT, '192.168.56.102', '54310', 'hadoop');
     */
 
 	export PipeOut(ECL_RS, HadoopFileName, Layout, HadoopFileFormat, HDFSHost, HDSFPort, HDFSUser) := MACRO
@@ -159,11 +159,11 @@ EXPORT HDFSPipe := MODULE
 	ENDMACRO;
 
     /*
-    HadoopPipe.PipeOutAndMerge - writes the given recordset 'ECL_RS' to the target HDFS system
+    HDFSConnector.PipeOutAndMerge - writes the given recordset 'ECL_RS' to the target HDFS system
                                                              in file parts and merges them together to form a single target file
                                                              on the HDFS system.
 
-    ECL_RS          - The ECL recordset to stream out.
+    ECL_RS          - The ECL recordset to pipe out.
     HadoopFileName  - The fully qualified target HDFS file name.
     Layout          - The structure which describes the ECL_RS recordset
     HadoopFileFormat- The Hadoop data file format : FLAT | CSV
@@ -174,8 +174,8 @@ EXPORT HDFSPipe := MODULE
 
     Example:
 
-    HadoopPipe.PipeOut(sue, '/user/hadoop/HDFSAccounts', Layout_CSV_Accounts, CSV, '192.168.56.102', '54310', 'hadoop');
-    HadoopPipe.PipeOut(sue, '/user/hadoop/HDFSPersons', Layout_Flat_Persons, FLAT, '192.168.56.102', '54310', 'hadoop');
+    HDFSConnector.PipeOut(sue, '/user/hadoop/HDFSAccounts', Layout_CSV_Accounts, CSV, '192.168.56.102', '54310', 'hadoop');
+    HDFSConnector.PipeOut(sue, '/user/hadoop/HDFSPersons', Layout_Flat_Persons, FLAT, '192.168.56.102', '54310', 'hadoop');
     */
 
 	export PipeOutAndMerge(ECL_RS, HadoopFileName, Layout, HadoopFileFormat, HDFSHost, HDSFPort, HDFSUser) := MACRO

+ 1 - 0
plugins/datastream/hdfsstream/hdfsstream.conf.in

@@ -1 +1,2 @@
 HADOOP_LOCATION=${HADOOP_PATH}
+LOGS_LOCATION=$log

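Once configure_file has substituted HADOOP_PATH, the generated hdfsconnector.conf is a small shell-style file that hdfspipe sources; an illustrative rendering (the Hadoop path is the example value used in the docs, and LOGS_LOCATION is resolved from the platform's log setting at run time):

    HADOOP_LOCATION=/usr/local/hadoop
    LOGS_LOCATION=$log
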
+ 11 - 14
plugins/datastream/hdfsstream/hdfsstream.cpp

@@ -10,7 +10,6 @@ using namespace std;
 using std::string;
 using std::vector;
 
-//#define EOL "\n\r"
 #define EOL "\n"
 
 tOffset getBlockSize(hdfsFS * filefs, const char * filename)
@@ -75,7 +74,7 @@ long getRecordCount(long fsize, int clustersize, int reclen, int nodeid)
 	if ((fsize / reclen) % clustersize >= nodeid + 1)
 	{
 		readSize += 1;
-		fprintf(stderr, "\nThis node will stream one extra rec\n");
+		fprintf(stderr, "\nThis node will pipe one extra rec\n");
 	}
 	return readSize;
 }
@@ -325,7 +324,7 @@ int readXMLOffset(hdfsFS * fs, const char * filename,
 					firstRowfound = strcmp(currentTag.c_str(),
 							openRowTag.c_str()) == 0;
 					if (firstRowfound)
-						fprintf(stderr, "--start streaming tag %s at %lu--\n",
+						fprintf(stderr, "--start piping tag %s at %lu--\n",
 								currentTag.c_str(), currentPos);
 				}
 
@@ -360,7 +359,7 @@ int readXMLOffset(hdfsFS * fs, const char * filename,
 						&& strcmp(currentTag.c_str(), closeRowTag.c_str()) == 0)
 				{
 					fprintf(stdout, "%s", currentTag.c_str());
-					fprintf(stderr, "--stop streaming at %s %lu--\n",
+					fprintf(stderr, "--stop piping at %s %lu--\n",
 							currentTag.c_str(), currentPos);
 					bytesLeft = 0;
 					break;
@@ -412,7 +411,7 @@ int readXMLOffset(hdfsFS * fs, const char * filename,
 
 int readCSVOffset(hdfsFS * fs, const char * filename, unsigned long seekPos,
 		unsigned long readlen, const char * eolseq, unsigned long bufferSize, bool outputTerminator,
-		unsigned long recLen, unsigned long maxlen, const char * quote)
+		unsigned long recLen, unsigned long maxLen, const char * quote)
 {
 	fprintf(stderr, "CSV terminator: \'%s\' and quote: \'%c\'\n", eolseq, quote[0]);
 	unsigned long recsFound = 0;
@@ -514,7 +513,7 @@ int readCSVOffset(hdfsFS * fs, const char * filename, unsigned long seekPos,
 						currentPos = currentPos + eolseqlen - 1;
 						bytesLeft = bytesLeft - eolseqlen;
 
-						fprintf(stderr, "\n--Start streaming: %ld--\n", currentPos);
+						fprintf(stderr, "\n--Start reading: %ld--\n", currentPos);
 
 						firstEOLfound = true;
 						continue;
@@ -534,7 +533,7 @@ int readCSVOffset(hdfsFS * fs, const char * filename, unsigned long seekPos,
 					//fprintf(stderr, "\nrecsfound: %ld", recsFound);
 					if (stopAtNextEOL)
 					{
-						fprintf(stderr, "\n--Stop streaming: %ld--\n", currentPos);
+						fprintf(stderr, "\n--Stop piping: %ld--\n", currentPos);
 						//fprintf(stdout, "%s", eolseq);
 						bytesLeft = 0;
 						break;
@@ -555,7 +554,7 @@ int readCSVOffset(hdfsFS * fs, const char * filename, unsigned long seekPos,
 				}
 			}
 
-			//don't stream until we're beyond the first EOL (if offset = 0 start streaming ASAP)
+			//don't pipe until we're beyond the first EOL (if offset = 0 start piping ASAP)
 			if (firstEOLfound)
 			{
 				fprintf(stdout, "%c", currChar);
@@ -565,7 +564,7 @@ int readCSVOffset(hdfsFS * fs, const char * filename, unsigned long seekPos,
 			{
 				fprintf(stderr, "%c", currChar);
 				bytesLeft--;
-				if(recLen > 0 && currentPos-seekPos > recLen * 100)
+				if(maxLen > 0 && currentPos-seekPos > maxLen * 10)
 				{
 					fprintf(stderr, "\nFirst EOL was not found within the first %lu bytes", currentPos-seekPos);
 					exit(-1);
@@ -575,7 +574,7 @@ int readCSVOffset(hdfsFS * fs, const char * filename, unsigned long seekPos,
 			if (stopAtNextEOL)
 				fprintf(stderr, "%c", currChar);
 
-			// ok, so if bytesLeft <= 0 at this point, we need to keep reading
+			// ok, so if bytesLeft <= 0 at this point, we need to keep piping
 			// IF the last char read was not an EOL char
 			if (bytesLeft <= 0	&& currChar != eolseq[0])
 			{
@@ -623,7 +622,7 @@ int readFileOffset(hdfsFS * fs, const char * filename, tOffset seekPos,
 
 	unsigned long currentPos = seekPos;
 
-	fprintf(stderr, "\n--Start streaming: %ld--\n", currentPos);
+	fprintf(stderr, "\n--Start piping: %ld--\n", currentPos);
 
 	unsigned long bytesLeft = readlen;
 	while(hdfsAvailable(*fs, readFile) && bytesLeft >0)
@@ -821,7 +820,7 @@ int writeFlatOffset(hdfsFS * fs, const char * filename, unsigned nodeid, unsigne
 	size_t totalbytesread = 0;
 	size_t totalbyteswritten = 0;
 
-	fprintf(stderr, "Writing %s to HDFS [.", filepartname);
+	fprintf(stderr, "Writing %s to HDFS.", filepartname);
  	while(!in.eof())
  	{
  		memset(&char_ptr[0], 0, sizeof(char_ptr));
@@ -831,7 +830,6 @@ int writeFlatOffset(hdfsFS * fs, const char * filename, unsigned nodeid, unsigne
  		tSize num_written_bytes = hdfsWrite(*fs, writeFile, (void*)char_ptr, bytesread);
  		totalbyteswritten += num_written_bytes;
 
- 		fprintf(stderr, ".");
  		//Need to figure out how often this should be done
  		//if(totalbyteswritten % )
 
@@ -850,7 +848,6 @@ int writeFlatOffset(hdfsFS * fs, const char * filename, unsigned nodeid, unsigne
  		fprintf(stderr, "Failed to 'flush' %s\n", filepartname);
 		exit(-1);
 	}
- 	fprintf(stderr, "]");
 
 	fprintf(stderr,"\n total read: %lu, total written: %lu\n", totalbytesread, totalbyteswritten);
 

+ 20 - 9
plugins/datastream/hdfsstream/hdfspipe.in

@@ -1,7 +1,9 @@
 #!/bin/bash
 
+source @HPCC_ETC_DIR@/init.d/hpcc_common
+set_environmentvars
 
-source @HDFSCONFIG@/hdfsstream.conf 
+source @HPCC_CONF_DIR@/@HDFSCONN_CONF_FILE@ 
 
 CLASSPATH=$CLASSPATH:$HADOOP_LOCATION/conf
 
@@ -22,7 +24,6 @@ nodeid=0;
 
 for p in $*;
  do
-#   echo "[$p]" >> $LOG;
    if [ "$idfound" = "1" ];
    then
         nodeid=$p;
@@ -33,7 +34,17 @@ for p in $*;
    fi
 done;
 
-LOG=/tmp/HPCC-HadoopStream.log.$nodeid.$PID
+#the log variable is read from the HPCC Platform config
+LOGS_LOCATION=$log
+HDFSCONNLOGLOC=$LOGS_LOCATION/mydataconnectors
+LOG=$HDFSCONNLOGLOC/HDFSCONNECTOR.$nodeid.$PID.log
+
+if [ -e $HDFSCONNLOGLOC ]
+  then
+    echo "log file found"	>> $LOG
+  else
+    mkdir $HDFSCONNLOGLOC
+fi
 
 echo "Script starting"		>> $LOG
 echo "Running as user: $USER"   >> $LOG
@@ -47,10 +58,10 @@ then
 	exit 1;
 elif [ $1 = "-mf" ];
 then
-	/opt/HPCCSystems/bin/hdfsstream "${@}" 2>> $LOG;
+	/opt/HPCCSystems/bin/hdfsconnector "${@}" 2>> $LOG;
 elif [ $1 = "-si" ];
 then
-	/opt/HPCCSystems/bin/hdfsstream  "${@}" 2>> $LOG;
+	/opt/HPCCSystems/bin/hdfsconnector  "${@}" 2>> $LOG;
 elif [ $1 = "-so" ];
 then
 
@@ -68,9 +79,9 @@ then
 
 	ls -l "$HPCCTMPFILE" 				>> $LOG
 
-	echo "calling hdfsstream..." 		>> $LOG
+	echo "calling hdfsconnector..." 		>> $LOG
 
-	/opt/HPCCSystems/bin/hdfsstream "${@}" -pipepath $HPCCTMPFILE  	2>> $LOG
+	/opt/HPCCSystems/bin/hdfsconnector "${@}" -pipepath $HPCCTMPFILE  	2>> $LOG
 
 	echo "write exited with: $?" 			>> $LOG
 elif [ $1 = "-sop" ];
@@ -85,9 +96,9 @@ then
 	then
 		rm -f /tmp/HPCC-FIFO.err.$PID 2> /dev/null
 	else
-		echo "  WARNING (hdfsstream mkfifo) error registered in file: /tmp/HPCC-FIFO.err.$PID " >> $LOG
+		echo "  WARNING (hdfsconnector mkfifo) error registered in file: /tmp/HPCC-FIFO.err.$PID " >> $LOG
 	fi
-	/opt/HPCCSystems/bin/hdfsstream  "${@}" -pipepath $pipepath	2>> $LOG &
+	/opt/HPCCSystems/bin/hdfsconnector  "${@}" -pipepath $pipepath	2>> $LOG &
 	echo "redirecting stdin to named pipe ... " 	>> $LOG
 	cat < /dev/stdin > "$pipepath"			2>> $LOG
 

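To make the wrapper's role concrete, here is an illustrative read (-si) invocation of the kind the HDFSConnector.PipeIn macro assembles through PIPE; every value is a placeholder drawn from the examples elsewhere in this commit:

    # Flags mirror those built by the PipeIn macro for a FLAT read; node id,
    # cluster size, record length, file name, host and port are examples only
    /opt/HPCCSystems/bin/hdfspipe -si \
        -nodeid 0 \
        -clustercount 3 \
        -maxlen 70 \
        -filename /user/hadoop/test/MyData1 \
        -format FLA \
        -host 192.168.56.120 -port 54310
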
+ 0 - 1
plugins/datastream/hdfsstream/ecl/CMakeLists.txt

@@ -1 +0,0 @@
-Install ( FILES HDFSPipe.ecl DESTINATION "${OSSDIR}/share/DataStream" COMPONENT Runtime )