- <?xml version="1.0" encoding="UTF-8"?>
- <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
- "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
- <book xml:base="../">
- <title>HPCC System Administrator's Guide</title>
- <bookinfo>
- <title>HPCC System Administrator's Guide</title>
- <mediaobject>
- <imageobject>
- <imagedata fileref="images/redswooshWithLogo3.jpg" />
- </imageobject>
- </mediaobject>
- <author>
- <surname>Boca Raton Documentation Team</surname>
- </author>
- <legalnotice>
- <para>We welcome your comments and feedback about this document via
- email to <email>docfeedback@hpccsystems.com</email></para>
- <para>Please include <emphasis role="bold">Documentation
- Feedback</emphasis> in the subject line and reference the document name,
- page numbers, and current Version Number in the text of the
- message.</para>
- <para>LexisNexis and the Knowledge Burst logo are registered trademarks
- of Reed Elsevier Properties Inc., used under license.</para>
- <para>HPCC Systems<superscript>®</superscript> is a registered trademark
- of LexisNexis Risk Data Management Inc.</para>
- <para>Other products, logos, and services may be trademarks or
- registered trademarks of their respective companies.</para>
- <para>All names and example data used in this manual are fictitious. Any
- similarity to actual persons, living or dead, is purely
- coincidental.</para>
- <para></para>
- </legalnotice>
- <xi:include href="common/Version.xml" xpointer="FooterInfo"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- <xi:include href="common/Version.xml" xpointer="DateVer"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- <corpname>HPCC Systems<superscript>®</superscript></corpname>
- <xi:include href="common/Version.xml" xpointer="Copyright"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- <mediaobject role="logo">
- <imageobject>
- <imagedata fileref="images/LN_Rightjustified.jpg" />
- </imageobject>
- </mediaobject>
- </bookinfo>
- <chapter>
- <title>Introducing HPCC Systems<superscript>®</superscript>
- Administration</title>
- <sect1 id="HPCC_SysAdminIntro" role="nobrk">
- <title>Introduction</title>
- <para>HPCC (High Performance Computing Cluster) is a massively
- parallel-processing computing platform that solves Big Data
- problems.</para>
- <para>HPCC stores and processes large quantities of data, processing
- billions of records per second using massive parallel processing
- technology. Large amounts of data across disparate data sources can be
- accessed, analyzed, and manipulated in fractions of seconds. HPCC
- functions as both a processing and a distributed data storage
- environment, capable of analyzing terabytes of information.</para>
- </sect1>
- <sect1 id="HPCC_Architectural_Overview">
- <title>Architectural Overview</title>
- <para>An HPCC Systems<superscript>®</superscript> Platform consists of
- the following components: Thor, Roxie, ESP Server, Dali, Sasha, DFU
- Server, and ECLCC Server. LDAP security is optionally available.</para>
- <para><figure>
- <title>HPCC Architectural Diagram</title>
- <mediaobject>
- <imageobject>
- <imagedata fileref="images/SA004.jpg" />
- </imageobject>
- </mediaobject>
- </figure></para>
- <?hard-pagebreak ?>
- <para>Data loading is controlled through the Distributed File Utility
- (DFU) server.</para>
- <para>Data typically arrives on the landing zone (for example, by FTP).
- File movement (across components) is initiated by DFU. Data is copied
- from the landing zone and is distributed (sprayed) to the Data Refinery
- (Thor) by the ECL code. Data can be further processed via ETL (Extract,
- Transform, and Load process) in the refinery.</para>
- <para>A single physical file is distributed into multiple physical files
- across the nodes of a cluster. The aggregate of the physical files
- creates one logical file that is addressed by the ECL code.</para>
- <para><figure>
- <title>Data Processing</title>
- <mediaobject>
- <imageobject>
- <imagedata fileref="images/SA002.jpg" />
- </imageobject>
- </mediaobject>
- </figure></para>
- <para>The data retrieval process (despraying) places the file back on
- the landing zone.</para>
- <sect2 role="brk">
- <title>Clusters</title>
- <para>The HPCC environment contains clusters that you define and use
- according to your needs. The types of clusters used in HPCC are:</para>
- <sect3>
- <title>Thor</title>
- <para>Data Refinery (Thor) – Used to process every one of billions
- of records in order to create billions of "improved" records. ECL
- Agent (hThor) is also used to process simple jobs that would be an
- inefficient use of the Thor cluster.</para>
- </sect3>
- <sect3>
- <title>Roxie</title>
- <para>Rapid Data Delivery Engine (Roxie) – Used to search quickly
- for a particular record or set of records.</para>
- <para>Queries are compiled and published, usually in ECL Watch. Data
- moves in parallel from Thor nodes to the receiving Roxie nodes.
- Parallel bandwidth utilization improves the speed of putting new
- data into play.</para>
- </sect3>
- <sect3>
- <title>ECL Agent</title>
- <para>The ECL Agent's primary function is to send the job to execute
- on the appropriate cluster. The ECL Agent can act as a single-node
- cluster. That is called spawning an hThor cluster. hThor is used to
- process simple jobs that would otherwise be an inefficient use of
- Thor. For simple tasks, the ECL Agent will make a determination and
- perform the execution itself by acting as an hThor cluster. <figure>
- <title>Clusters</title>
- <mediaobject>
- <imageobject>
- <imagedata fileref="images/SA003.jpg" />
- </imageobject>
- </mediaobject>
- </figure></para>
- </sect3>
- </sect2>
- <sect2 role="brk">
- <title>System Servers</title>
- <para>The System Servers are integral middleware components of an HPCC
- system. They are used to control workflow and intercomponent
- communication.</para>
- <sect3>
- <title>Dali</title>
- <para>Dali is also known as the system data store. It manages
- workunit records, the logical file directory, and shared object
- services.</para>
- <para>It maintains the message queues that drive job execution and
- scheduling. It also enforces all LDAP security
- restrictions.</para>
- </sect3>
- <sect3>
- <title>Sasha</title>
- <para>The Sasha server is a companion “housekeeping” server to the
- Dali server. It works independently of all other components. Sasha’s
- main function is to reduce the stress on the Dali server. Whenever
- possible, Sasha reduces the resource utilization on Dali.</para>
- <para>Sasha archives workunits (including DFU Workunits) which are
- stored in a series of folders.</para>
- <para>Sasha also performs routine housekeeping such as removing
- cached workunits and DFU recovery files.</para>
- </sect3>
- <sect3>
- <title>DFU Server</title>
- <para>DFU server controls the spraying and despraying operations
- used to move data in and out of Thor.</para>
- <para>DFU services are available from: <itemizedlist>
- <listitem>
- <para>Standard libraries in ECL code.</para>
- </listitem>
- <listitem>
- <para>Client interfaces: Eclipse, ECL Playground, ECL IDE, and
- the ECL command line interface.</para>
- </listitem>
- <listitem>
- <para>DFU Plus command line interface.</para>
- </listitem>
- </itemizedlist></para>
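- <para>For illustration, the following is a minimal sketch of a spray
- submitted through the DFU Plus command line interface. The server
- address, landing zone IP, source path, and logical file name are
- placeholders; see the <emphasis>Client Tools</emphasis> manual for
- the full list of options.</para>
- <programlisting>dfuplus action=spray server=http://192.168.1.10:8010 \
-          srcip=192.168.1.20 srcfile=/var/lib/HPCCSystems/mydropzone/people.csv \
-          dstname=tutorial::sample::people dstcluster=mythor format=csv</programlisting>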
- </sect3>
- <sect3>
- <title>ECLCC Server</title>
- <para>ECLCC Server is the compiler that translates ECL code. When
- you submit ECL code, the ECLCC Server generates optimized C++ which
- is then compiled and executed. ECLCC Server controls the whole
- compilation process.</para>
- <para>When you submit workunits for execution on Thor, they are
- first converted to executable code by the ECLCC Server.</para>
- <para>When you submit a workunit to Roxie, code is compiled and
- later published to the Roxie cluster, where it is available to
- execute multiple times.</para>
- <para>ECLCC Server is also used when the ECL IDE requests a syntax
- check.</para>
- <para>ECLCC Server uses a queue to convert workunits one at a time;
- however, you can deploy multiple ECLCC Servers in the system to
- increase throughput, and they will automatically load balance as
- required.</para>
- </sect3>
- <sect3>
- <title>ECL Agent</title>
- <para>ECL Agent (hThor) is a single node process for executing
- simple ECL Queries.</para>
- <para>ECL Agent is an execution engine that processes workunits by
- sending them to the appropriate cluster. ECL Agent processes are
- spawned on-demand when you submit a workunit.</para>
- </sect3>
- <sect3>
- <title>ESP Server</title>
- <para>ESP (Enterprise Service Platform) Server is the
- inter-component communication server. ESP Server is a framework that
- allows multiple services to be “plugged in” to provide various types
- of functionality to client applications via multiple
- protocols.</para>
- <para>Examples of services that are plugged into ESP
- include:<itemizedlist>
- <listitem>
- <para><emphasis role="bold">WsECL:</emphasis> Interface to
- published queries on a Roxie, Thor, or hThor cluster.</para>
- </listitem>
- <listitem>
- <para><emphasis role="bold">ECL Watch:</emphasis> A web-based
- query execution, monitoring, and file management interface. It
- can be accessed via the ECL IDE or a web browser. See
- <emphasis>Using ECL Watch</emphasis>.</para>
- </listitem>
- </itemizedlist></para>
- <para>The ESP Server supports both XML and JSON formats.</para>
- <!--formerly : protocols - HTTP, HTTPS, SOAP, and JSON - -->
- </sect3>
- <sect3>
- <title>LDAP</title>
- <para>You can incorporate a Lightweight Directory Access Protocol
- (LDAP) server to work with Dali to enforce the security restrictions
- for file scopes, workunit scopes, and feature access.</para>
- <para>When LDAP is configured, you need to authenticate when
- accessing ECL Watch, WsECL, ECL IDE, or any other client tools.
- Those credentials are then used to authenticate any requests from
- those tools.</para>
- </sect3>
- <!-- *** COMMENTING OUT WHOLE Of MONITORING SECTION
- <sect3>
- <title>HPCC Reporting</title>
- <para>HPCC leverages the use of Ganglia reporting and monitoring
- components to monitor several aspects of the HPCC System.</para>
- <para>See <emphasis>HPCC Monitoring and Reporting</emphasis> for
- more information on how to add monitoring and reporting to your HPCC
- System.</para>
- <para>More to come***</para>
- </sect3>
- END COMMENT ***-->
- </sect2>
- <sect2>
- <title>Client Interfaces</title>
- <para>The following Client Interfaces are available to interact with
- the HPCC Platform.</para>
- <sect3>
- <title>Eclipse</title>
- <para>With the ECL plug-in for Eclipse, you can use the Eclipse IDE
- to create and execute queries into your data on an HPCC platform
- using Enterprise Control Language (ECL). Eclipse is open-source and
- multi-platform, and it can be used to interface with your data and
- workunits on HPCC. The ECL plug-in for Eclipse is also
- open-source.</para>
- </sect3>
- <sect3>
- <title>ECL IDE</title>
- <para>ECL IDE is a full-featured GUI providing access to your ECL
- code for ECL development. ECL IDE uses various ESP services via
- SOAP.</para>
- <para>The ECL IDE provides access to ECL Definitions to build your
- queries. These definitions are created by coding an expression that
- defines how some calculation or record set derivation is to be done.
- Once defined, they can be used in succeeding ECL definitions.</para>
- </sect3>
- <sect3>
- <title>ECL Watch</title>
- <para>ECL Watch is a web-based query execution, monitoring, and file
- management interface. It can be accessed via ECL IDE, Eclipse, or a
- web browser. ECL Watch allows you to see information about and
- manipulate workunits. It also allows you to monitor cluster activity
- and perform other administrative tasks.</para>
- <para>Using ECL Watch you can:<itemizedlist>
- <listitem>
- <para>Browse through previously submitted workunits (WU). You
- can see a visual representation (graphs) of the data flow
- within the WU, complete with statistics which are updated as
- the job progresses.</para>
- </listitem>
- <listitem>
- <para>Search through files and see information including
- record counts and layouts or sample records.</para>
- </listitem>
- <listitem>
- <para>See the status of all system servers.</para>
- </listitem>
- <listitem>
- <para>View log files.</para>
- </listitem>
- <listitem>
- <para>Add users or groups and modify permissions.</para>
- </listitem>
- </itemizedlist></para>
- <para>See the <emphasis>Using ECL Watch </emphasis>Manual for more
- details.</para>
- </sect3>
- <sect3>
- <title><emphasis role="bold">Command Line Tools</emphasis></title>
- <para>Command line tools: <emphasis role="bold">ECL, DFU
- Plus</emphasis>, and <emphasis role="bold">ECL Plus</emphasis>
- provide command line access to functionality provided by the ECL
- Watch web pages. They work by communicating with the corresponding
- ESP service via SOAP.</para>
- <para>See the <emphasis>Client Tools </emphasis>Manual for more
- details.</para>
- </sect3>
- </sect2>
- </sect1>
- <!--Inclusion-from-ClientTool-As-Sect1: REMOVED-->
- </chapter>
- <chapter>
- <title>Hardware and Software Requirements</title>
- <para>This chapter describes the hardware and software requirements on
- which HPCC works well. HPCC is designed to run on commodity hardware,
- which makes building and maintaining large scale (petabytes) clusters
- economically feasible. When planning your cluster hardware, you will need
- to balance a number of considerations.</para>
- <para>This section provides some insight into the sort of hardware and
- infrastructure on which HPCC works optimally. It is not an exhaustive
- set of instructions, nor a mandate on what hardware you must
- have. Consider it a guide to use when looking to implement or scale
- your HPCC system, and weigh these suggestions against
- your specific enterprise needs.</para>
- <xi:include href="Installing_and_RunningTheHPCCPlatform/Inst-Mods/Hardware.xml"
- xpointer="HW-Switch"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- <xi:include href="Installing_and_RunningTheHPCCPlatform/Inst-Mods/Hardware.xml"
- xpointer="HW-LoadBalancer"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- <xi:include href="Installing_and_RunningTheHPCCPlatform/Inst-Mods/Hardware.xml"
- xpointer="Nodes-Hardware"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- <xi:include href="HPCCSystemAdmin/SA-Mods/SysAdminConfigMod.xml"
- xpointer="System_sizings"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- <xi:include href="Installing_and_RunningTheHPCCPlatform/Inst-Mods/Hardware.xml"
- xpointer="Nodes-Software"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- <xi:include href="Installing_and_RunningTheHPCCPlatform/Inst-Mods/Hardware.xml"
- xpointer="workstation-requirements"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- </chapter>
- <chapter>
- <title>Hardware and Component Sizing</title>
- <para>This section provides some insight into the sort of hardware and
- infrastructure on which HPCC works optimally. It is not an exhaustive
- set of instructions, nor a mandate on what hardware you must
- have. Consider it a guide to use when looking to implement or scale
- your HPCC system, and weigh these suggestions against
- your specific enterprise needs.</para>
- <para>HPCC is designed to run on commodity hardware, which makes building
- and maintaining large scale (petabytes) clusters economically feasible.
- When planning your cluster hardware, you will need to balance a number of
- considerations, including fail-over domains and potential performance
- issues. Hardware planning should include distributing HPCC across multiple
- physical hosts, such as a cluster. Generally, one type of best practice is
- to run HPCC processes of a particular type, for example Thor, Roxie, or
- Dali, on a host configured specifically for that type of process.</para>
- <sect1>
- <title>Thor Hardware</title>
- <para>Thor slave nodes require a proper balance of CPU, RAM, network,
- and disk I/O in order to operate most efficiently. A single Thor slave
- node works optimally when allocated 4 CPU cores, 8GB RAM, 1Gb/sec
- network and 200MB/sec sequential read/write disk I/O.</para>
- <para>Modern hardware can provide considerably more capacity than this within a single
- physical server. In such cases you can use the multi-slave option to configure your
- larger physical servers to run multiple Thor slave nodes per physical
- server.</para>
- <para>It is important to note that HPCC is by nature a parallel
- processing system and all Thor slave nodes will be exercising at
- precisely the same time. So when allocating more than one HPCC Thor
- slave per physical machine, ensure that each slave meets the recommended
- requirements.</para>
- <para>For instance, 1 physical server with 48 cores, 96GB RAM, 10Gb/sec
- network and 2GB/sec sequential I/O would be capable of running ten (10)
- HPCC Thor slaves at optimal efficiency. The order of optimization for
- resource usage in a Thor slave node is disk I/O 60%, network 30%, and
- CPU 10%. Any increase in sequential I/O will have the most impact on
- speed, followed by improvements in network, followed by improvements in
- CPU.</para>
- <para>Network architecture is also an important consideration. HPCC Thor
- nodes work optimally in a streamlined network architecture between all
- Thor slave processes.</para>
- <para>RAID is recommended and all RAID levels suitable for sequential
- read/write operations and high availability are acceptable. For example,
- RAID1, RAID10, RAID5 (preferred), and RAID6.</para>
- </sect1>
- <sect1>
- <title>Roxie Hardware Configurations</title>
- <para>HPCC Roxie processes require a proper, yet different (from
- Thor), balance of CPU, RAM, network, and disk I/O in order to ensure
- efficient operations. A single HPCC Roxie node works optimally when
- allocated 6 or more CPU cores, 24GB RAM, a 1Gb/sec network backbone, and
- 400 4k random-read IOPS.</para>
- <para>Each HPCC Roxie node is presented with two hard drives, each capable of
- 200 4k random-seek IOPS. Hard drive recommendations for Roxie
- efficiency are 15K SAS or SSD. A good rule of thumb is that the more random-read
- IOPS available, the faster your Roxie will perform.</para>
- <para>Running multiple HPCC Roxie nodes on a single physical server is
- not recommended, except in the cases of virtualization or
- containers.</para>
- </sect1>
- <sect1>
- <title>Dali and Sasha Hardware Configurations</title>
- <para>HPCC Dali processes store cluster metadata in RAM. For optimal
- efficiency, provide at least 48GB of RAM, 6 or more CPU cores, 1Gb/sec
- network interface and a high availability disk for a single HPCC Dali.
- HPCC's Dali processes are one of the few active/passive components.
- Using standard “swinging disk” clustering is recommended for a high
- availability setup. For a single HPCC Dali process, any suitable High
- Availability (HA) RAID level is fine.</para>
- <para>Sasha does not store any original data; it reads data from Dali and then
- processes it. Sasha does, however, store archived workunits (WUs) on disk.
- Allocating a larger disk for Sasha reduces the amount of housekeeping
- needed. Since Sasha assists Dali by performing housekeeping, it works
- best when on its own node. You should avoid putting Sasha and Dali on
- the same node.</para>
- </sect1>
- <sect1>
- <title>Other HPCC Components</title>
- <para>ECL Agent, ECLCC Server, DFU Server, the Thor master, and ECL
- Watch are administrative processes that support the operation of
- the main clusters.</para>
- <para>For maximum efficiency you should provide 24GB RAM, 6+ CPU cores,
- 1Gb/sec network and high availability disk(s). These components can be
- made highly available in an active/active fashion.</para>
- </sect1>
- </chapter>
- <chapter id="Routine_Maintenance">
- <title>Routine Maintenance</title>
- <para>In order to ensure that your HPCC system keeps running optimally,
- some care and maintenance is required. The following sections address
- routine maintenance tasks for your HPCC system.</para>
- <!--***SYSTEM HEALTH CHECK UP***TO COME***-->
- <sect1 role="nobrk">
- <title>Back Up Data</title>
- <para>An integral part of routine maintenance is the back up of
- essential data. Devise a back up strategy to meet the needs of your
- organization. This section is not meant to replace your current back up
- strategy; instead, it supplements it by outlining special
- considerations for HPCC Systems<superscript>®</superscript>.</para>
- <sect2>
- <title>Back Up Considerations</title>
- <para>You probably already have some sort of back up strategy in
- place. By adding HPCC Systems<superscript>®</superscript> into your
- operating environment, there are some additional considerations to be
- aware of. The following sections discuss back up considerations for
- the individual HPCC system components.</para>
- <sect3>
- <title>Dali</title>
- <para>Dali can be configured to create its own back up; ideally, that
- back up should be kept on a different server or node. You can
- specify the Dali back up folder location using the Configuration
- Manager. You may want to keep multiple copies of that back up to be
- able to restore to a certain point in time. For example, you may
- want to take daily or weekly snapshots.</para>
- <para>You may want to keep back up copies at a system level using
- traditional back up methods.</para>
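- <para>As a hedged illustration of a system-level copy, the Dali back
- up folder (shown here with a placeholder path and host) could be
- synchronized to another server with a tool such as rsync:</para>
- <programlisting># example only; substitute your configured Dali back up folder and host
- rsync -a /var/lib/HPCCSystems/dalibackup/ backuphost:/backups/dali/</programlisting>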
- </sect3>
- <sect3>
- <title>Sasha</title>
- <para>Sasha itself generates no original data but archives workunits
- to disks. Be aware that Sasha can create quite a bit of archive
- data. Once the workunits are archived they are no longer available
- in the Dali data store. The archives can still be retrieved, but
- that archive now becomes the only copy of these workunits.</para>
- <para>If you need high availability for these archived workunits,
- you should back them up at a system level using traditional back up
- methods.</para>
- </sect3>
- <sect3>
- <title>DFU Server</title>
- <para>DFU Server has no data. DFU workunits are stored in Dali until
- they are archived by Sasha.</para>
- </sect3>
- <sect3>
- <title>ECLCC Server</title>
- <para>ECLCC Server stores no data. ECL workunits are stored in Dali
- and archived by Sasha.</para>
- <!--***COMMENT:<para><emphasis role="bold">Note:</emphasis> No compiler is shipped
- with the HPCC System. The ECLCC Server compiles ECL code into C++,
- however you must have a C++ compiler to use on your system. </para> -->
- </sect3>
- <sect3>
- <title>ECL Agent</title>
- <para>ECL Agent stores no data.</para>
- </sect3>
- <sect3>
- <title>ECL Scheduler</title>
- <para>ECL Scheduler stores no data. ECL Workunits are stored in
- Dali.</para>
- </sect3>
- <sect3>
- <title>ESP Server</title>
- <para>ESP Server stores no data. If you are using SSL certificates,
- the public and private keys should be backed up using traditional
- methods.</para>
- </sect3>
- <sect3>
- <title>Thor</title>
- <para>Thor, the data refinery, is one of the critical components of
- HPCC Systems<superscript>®</superscript> and needs to be backed up. Back
- up Thor by configuring replication and setting up a nightly back up
- cron task. Back up Thor on demand before and/or after any node swap
- or drive swap if you do not have RAID configured.</para>
- <para>A very important part of administering Thor is to check the
- logs to ensure the previous back ups completed successfully.</para>
- <para><emphasis role="bold">Backupnode</emphasis></para>
- <para>Backupnode is a tool that is packaged with HPCC. Backupnode
- allows you to back up Thor nodes on demand or in a script. You can
- also use backupnode regularly in a crontab. You would always want to
- run it on the Thor master of that cluster.</para>
- <para>The following example is one suggested way for invoking
- backupnode manually.</para>
- <programlisting> /bin/su - hpcc -c "/opt/HPCCSystems/bin/start_backupnode thor" & </programlisting>
- <para>The command line parameter must match the name of your Thor
- cluster. In your production environment, it is likely that you would
- provide descriptive names for your Thor clusters.</para>
- <para>For example, if your Thor cluster is named thor400_7s, you
- would call start_backupnode thor400_7s.</para>
- <programlisting> /bin/su - hpcc -c "/opt/HPCCSystems/bin/start_backupnode thor400_7s" & </programlisting>
- <para>To run backupnode regularly you could use cron. For example,
- you may want a crontab entry (to back up thor400_7s) set to run at
- 1am daily:</para>
- <programlisting> 0 1 * * * /bin/su - hpcc -c "/opt/HPCCSystems/bin/start_backupnode thor400_7s" & </programlisting>
- <para>Backupnode writes out its activity to a log file. That log can
- be found at /var/log/HPCCSystems/backupnode/MM_DD_YYYY_HH_MM_SS.log
- with the (MM) Month, (DD) Day, (YYYY) 4-digit Year, (HH) Hour, (MM)
- Minutes, and (SS) Seconds of the back up in the log file name. The
- main log file exists on the Thor master node. It shows what nodes it
- is run on and if it finished. You can find other backupnode logs on
- each of the thorslaves showing what files, if any, it needed to
- “restore”.</para>
- <para>It is important to check the logs to ensure the previous back
- ups completed successfully. The following entry is from the
- backupnode log showing that back up completed successfully:</para>
- <programlisting>00000028 2014-02-19 12:01:08 26457 26457 "Completed in 0m 0s with 0 errors"
- 00000029 2014-02-19 12:01:08 26457 26457 "backupnode finished" </programlisting>
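- <para>One quick way to confirm the most recent run, assuming the
- default log location described above, is to search the backupnode
- logs for the completion message:</para>
- <programlisting>grep "Completed in" /var/log/HPCCSystems/backupnode/*.log | tail -1</programlisting>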
- </sect3>
- <sect3>
- <title>Roxie</title>
- <para>Roxie data is protected by three forms of redundancy:</para>
- <itemizedlist>
- <listitem>
- <para><emphasis role="bold">Original Source Data File
- Retention:</emphasis> When a query is published, the data is
- typically copied from a remote site, either a Thor or a Roxie.
- The Thor data can serve as back up, provided it is not removed
- or altered on Thor. Thor data is typically retained for a period
- of time sufficient to serve as a back up copy.</para>
- </listitem>
- <listitem>
- <para><emphasis role="bold">Peer-Node Redundancy:</emphasis>
- Each Slave node typically has one or more peer nodes within its
- cluster. Each peer stores a copy of data files it will
- read.</para>
- </listitem>
- <listitem>
- <para><emphasis role="bold">Sibling Cluster
- Redundancy:</emphasis> Although not required, you may run
- multiple identically-configured Roxie clusters. When two
- clusters are deployed for Production each node has an identical
- twin in terms of queries and/or data stored on the node in the
- other cluster. This configuration provides multiple redundant
- copies of data files. With three sibling Roxie clusters that
- have peer node redundancy, there are always six copies of each
- file part at any given time; eliminating the need to use
- traditional back up procedures for Roxie data files.</para>
- </listitem>
- </itemizedlist>
- </sect3>
- <sect3>
- <title>Landing Zone</title>
- <para>The Landing Zone is used to host incoming and outgoing files.
- This should be treated similarly to an FTP server. Use traditional
- system level back ups.</para>
- </sect3>
- <sect3>
- <title>Misc</title>
- <para>Back up of any additional component add-ons, your environment
- files (environment.xml), or other custom configurations should be
- done according to traditional back up methods.</para>
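- <para>For example, a simple scheduled copy of the environment file
- (default location assumed) to a back up directory of your choosing
- might look like the following:</para>
- <programlisting># example only; adjust the destination to fit your back up strategy
- cp -p /etc/HPCCSystems/environment.xml /backup/HPCCSystems/environment.xml.$(date +%Y%m%d)</programlisting>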
- </sect3>
- </sect2>
- </sect1>
- <sect1 id="Log_Files">
- <title>Log Files</title>
- <para>You can review system messages and see any error messages as they
- are reported and captured in log files. Log files can help you
- understand what is occurring on the system and are useful in
- troubleshooting.</para>
- <sect2 id="Component_Logs">
- <title>Component Logs</title>
- <para>There are log files for each component in directories below
- <emphasis role="bold">/var/log/HPCCSystems</emphasis> (default
- location). You can optionally configure the system to write the logs
- in a different directory. You should know where the log files are, and
- refer to the logs first when troubleshooting any issues.</para>
- <para>There are log files which record activity among the various
- components. You can find the log files in subdirectories
- corresponding to the components that they track. For example, the Thor
- logs would be found in a directory named mythor, the sasha log would
- be in the mysasha directory, the esp log in the myesp
- directory.</para>
- <para>In each of the component subdirectories, there are several log
- files. Most of the log files use a logical naming convention that
- includes the component name, the date, and time in the name of the log
- file. There is also usually a link for the component with a simple
- name, such as esp.log, which is a shortcut to the current log
- file for that component.</para>
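- <para>For example, assuming the default log directory, you can follow
- the current ESP log from a terminal as follows:</para>
- <programlisting>tail -f /var/log/HPCCSystems/myesp/esp.log</programlisting>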
- <para>Understanding the log files, and what is normally reported in
- the log files, helps in troubleshooting the HPCC system.</para>
- <para>As part of routine maintenance you may want to back up, archive,
- and remove the older log files.</para>
- </sect2>
- <sect2>
- <title>Accessing Log Files</title>
- <para>You can access and view the log files directly by going to the
- component log directory from a command prompt or a terminal
- application. You can also view the component log files through ECL
- Watch.</para>
- <para>To view logs on ECL Watch, click on the <emphasis
- role="bold">Operations</emphasis> icon, then click on the <emphasis
- role="bold">System Servers</emphasis> link. That opens the System
- Servers page in ECL Watch. There are several HPCC system components
- listed on that page. In the <emphasis role="bold">Directory</emphasis>
- column for each component there is a computer drive icon. Click the
- icon in the row for the component log you wish to view. <figure>
- <title>Logs in ECL Watch</title>
- <mediaobject>
- <imageobject>
- <imagedata fileref="images/SA005.jpg" />
- </imageobject>
- </mediaobject>
- </figure></para>
- <para>You can also view log files from the other links under the
- Operations icon in ECL Watch. <orderedlist>
- <listitem>
- <para>Click on the <emphasis role="bold">Target
- Clusters</emphasis> link to open the tab with links to your
- system's clusters.</para>
- </listitem>
- <listitem>
- <para>Click on the computer drive icon (circled in red in the
- above figure), in the row of the cluster and node of the
- component log you wish to view.</para>
- </listitem>
- </orderedlist></para>
- <para>To view cluster process logs: <orderedlist>
- <listitem>
- <para>Click on the <emphasis role="bold">Cluster
- Processes</emphasis> link to open the tab with links to your
- system's clusters processes.</para>
- </listitem>
- <listitem>
- <para>Click on the cluster process you wish to view more
- information about.</para>
- <para>For example, click on the <emphasis
- role="bold">myroxie</emphasis> link. You will then see a page of
- all that components nodes. You will see computer drive icon, in
- the row of each node. Click that icon to see the logs for the
- cluster process for that node.</para>
- </listitem>
- </orderedlist></para>
- <sect3 id="Workunit_Logs">
- <title>Log files in ECL Workunits</title>
- <para>You can also access the Thor or ECL Agent log files from the
- ECL workunits (not available for Roxie workunits). In ECL Watch, when
- examining the workunit details, you will see a <emphasis
- role="bold">Helpers</emphasis> tab. Click on the Helpers tab to
- display the relevant log files for that particular workunit. <figure>
- <title>Logs in ECL Watch Workunits</title>
- <mediaobject>
- <imageobject>
- <imagedata fileref="images/SA006.jpg" />
- </imageobject>
- </mediaobject>
- </figure></para>
- </sect3>
- </sect2>
- </sect1>
- </chapter>
- <xi:include href="HPCCCertify/Cert-Mods/CertPreflight.xml"
- xpointer="Cert_Prelight"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- <chapter id="OnDemand_Maintenance">
- <title>System Configuration and Management</title>
- <para>The HPCC system requires configuration. The Configuration Manager
- tool (configmgr) included with the system software is a valuable aid in
- setting up your HPCC system. The Configuration Manager is a graphical tool
- that can be used to configure your system. Configuration Manager
- has a wizard that you can run to easily generate an environment
- file and get you configured, up, and running quickly. There is an advanced
- option available through Configuration Manager which allows for a more
- specific configuration, while still using the graphical interface. If
- desired, you can edit the environment files using any XML or text editor;
- however, the file structure must remain valid.</para>
- <para><figure>
- <title>Sample Production Configuration</title>
- <mediaobject>
- <imageobject>
- <imagedata fileref="images/SA008.jpg" />
- </imageobject>
- </mediaobject>
- </figure></para>
- <!--/*Including special SysAdmin Config Module -paras- */-->
- <xi:include href="HPCCSystemAdmin/SA-Mods/SysAdminConfigMod.xml"
- xpointer="cfgmgr_introP0"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- <xi:include href="HPCCSystemAdmin/SA-Mods/SysAdminConfigMod.xml"
- xpointer="cfgmgr_introP1"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- <xi:include href="HPCCSystemAdmin/SA-Mods/SysAdminConfigMod.xml"
- xpointer="cfgmgr_p1b"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- <xi:include href="HPCCSystemAdmin/SA-Mods/SysAdminConfigMod.xml"
- xpointer="cfgmgr_introP2"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- <xi:include href="HPCCSystemAdmin/SA-Mods/SysAdminConfigMod.xml"
- xpointer="cfgmgr_introP3"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- <!--/*Including special SysAdmin Config Module -Sect1- */-->
- <xi:include href="HPCCSystemAdmin/SA-Mods/SysAdminConfigMod.xml"
- xpointer="configuring-a-multi-node-system"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- <sect1>
- <title>Environment.conf</title>
- <para>Another component of HPCC system configuration is the
- environment.conf file. Environment.conf contains some global definitions
- that the configuration manager uses to configure the HPCC system. In
- most cases, the defaults are sufficient.</para>
- <para><informaltable colsep="1" frame="all" rowsep="1">
- <tgroup cols="2">
- <colspec colwidth="49.50pt" />
- <colspec />
- <tbody>
- <row>
- <entry><inlinegraphic fileref="images/caution.png" /></entry>
- <entry><emphasis role="bold">WARNING</emphasis>: These
- settings are essential to proper system operation. Only expert
- level HPCC administrators should attempt to change any aspects
- of this file.</entry>
- </row>
- </tbody>
- </tgroup>
- </informaltable>By default the environment.conf file is
- located:</para>
- <programlisting>/etc/HPCCSystems</programlisting>
- <para>Environment.conf is required upon startup of HPCC. The
- environment.conf is where the HPCC environment file is defined.</para>
- <programlisting>/opt/HPCCSystems/environment.xml</programlisting>
- <para>This is also where the working path is defined.</para>
- <programlisting>path=/opt/HPCCSystems</programlisting>
- <para>The working path is used by several aspects of the application;
- changing it could cause needless complications. By default the
- application installs there, and sets many resources to that path as
- well.</para>
- <para>The default environment.conf:</para>
- <para><programlisting>## HPCC Systems default environment configuration file
- [DEFAULT SETTINGS]
- configs=/etc/HPCCSystems
- path=/opt/HPCCSystems
- classpath=/opt/HPCCSystems/classes
- runtime=/var/lib/HPCCSystems
- lock=/var/lock/HPCCSystems
- # Supported logging fields: AUD,CLS,DET,MID,TIM,DAT,PID,TID,NOD,JOB,USE,SES,
- # COD,MLT,MCT,NNT,COM,QUO,PFX,ALL,STD
- logfields=TIM+DAT+MLT+MID+PID+TID+COD+QUO+PFX
- pid=/var/run/HPCCSystems
- log=/var/log/HPCCSystems
- user=hpcc
- group=hpcc
- home=/Users
- environment=environment.xml
- sourcedir=/etc/HPCCSystems/source
- blockname=HPCCSystems
- interface=*
- # enable epoll method for notification events (true/false)
- use_epoll=true
- </programlisting></para>
- <sect2>
- <title>Path considerations</title>
- <para>Most of the directories are defined as absolute paths:</para>
- <programlisting>configs=/etc/HPCCSystems
- path=/opt/HPCCSystems
- classpath=/opt/HPCCSystems/classes
- runtime=/var/lib/HPCCSystems
- lock=/var/lock/HPCCSystems</programlisting>
- <para>HPCC will not run properly without the proper paths, and in some
- cases needs the absolute path. If a process or component can't find a
- path, you will get an error message such as the following:</para>
- <programlisting>“There are no components configured to run on the node…” </programlisting>
- <para>If a path is changed from its HPCCSystems default, it does NOT change in the
- environment.xml file. Any such change requires manually modifying the
- environment.xml file as well.</para>
- <para>The log file, <emphasis>hpcc-init.log</emphasis>, is written to
- the HPCCSystems path.</para>
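- <para>For instance, if components fail to start, searching that log
- for errors can be a useful first step (default log directory
- assumed):</para>
- <programlisting>grep -i error /var/log/HPCCSystems/hpcc-init.log</programlisting>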
- </sect2>
- <sect2>
- <title>Other Environment.conf items</title>
- <para>The following are some other items used by or referred to in
- environment.conf.<variablelist>
- <varlistentry>
- <term>Use_epoll</term>
- <listitem>
- <para>An event notification mechanism used to achieve better performance
- in more demanding applications where the number of watched file
- descriptors is large.</para>
- </listitem>
- </varlistentry>
- <varlistentry>
- <term>Logfields</term>
- <listitem>
- <para>Categories available to be logged. These consist of
- Time(TIM), Date(DAT), Process ID (PID), Thread ID (TID),
- etc.</para>
- </listitem>
- </varlistentry>
- <varlistentry>
- <term>Interface</term>
- <listitem>
- <para>In the default environment.conf there is a value for
- interface. The default value for that is:</para>
- <programlisting>interface=*</programlisting>
- <para>The default value of * assigns the interface to an open
- IP address, in any order. Specifying an interface, such as
- eth0, will assign the specified node as the primary.<!--***Add More info... WHY DOES THIS MATTER?--></para>
- </listitem>
- </varlistentry>
- </variablelist></para>
- </sect2>
- </sect1>
- <!--Inclusions-As-Sect1-->
- <xi:include href="Installing_and_RunningTheHPCCPlatform/Inst-Mods/UserSecurityMaint.xml"
- xpointer="User_Security_Maint"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- <sect1>
- <title>Workunits and Active Directory</title>
- <para>The performance of your system can vary depending on how some
- components interact. One area which could impact performance is the
- relationship with users, groups, and Active Directory. If possible,
- having a separate Active Directory specific to HPCC could be a good
- policy. There have been a few instances where just one Active Directory
- servicing many, diverse applications has been less than optimal.</para>
- <para>HPCC makes setting up your Active Directory OU's relatively easy.
- ESP creates all the OU's for you when it starts up, based on the
- settings you defined in Configuration Manager. You can then start
- Dali/ESP and use ECLWatch to add or modify users or groups.</para>
- <para>You can assign permissions to each user individually, however it
- is more manageable to assign these permissions to groups, and then add
- users to these groups as appropriate. Create a group for developers and
- power users (people with full read/write/delete access), and another
- group for users that have only read access, and perhaps another
- group that has both read and write access. Add any other groups as
- appropriate for your environment. Now you can assign users to their
- appropriate group(s).</para>
- <sect2>
- <title>Active Directory and LDAP Commonality</title>
- <para>There are components that are common to both Active Directory
- and LDAP. There are a few relevant terms that may need some further
- explanation. <variablelist>
- <varlistentry>
- <term>filesBasedn</term>
- <listitem>
- <para>Deals with restricting access to files. Also referred to
- as “file scoping“.</para>
- </listitem>
- </varlistentry>
- <varlistentry>
- <term>groupsBasedn</term>
- <listitem>
- <para>Controls the groups associated with the environment. For
- example, administrators, developers, ws_ecl only, etc.</para>
- </listitem>
- </varlistentry>
- <varlistentry>
- <term>modulesBasedn</term>
- <listitem>
- <para>Specific to systems using a legacy central repository
- and controls access to specific modules. Any module you create
- in the application will create an entry in
- Eclwatch>>User/Permissions>>Repository
- Modules.</para>
- </listitem>
- </varlistentry>
- <varlistentry>
- <term>sudoersBasedn</term>
- <listitem>
- <para>Deprecated.</para>
- </listitem>
- </varlistentry>
- <varlistentry>
- <term>workunitsBasedn</term>
- <listitem>
- <para>Controls access to workunits.</para>
- </listitem>
- </varlistentry>
- </variablelist></para>
- </sect2>
- </sect1>
- <sect1>
- <title>Data Handling</title>
- <para>When you start working with your HPCC system, you will want to
- have some data on the system to process. Data is transferred to
- the HPCC system by a process called a spray. Likewise, to get data out
- of an HPCC system, it must be desprayed.</para>
- <para>As HPCC is a computer cluster, the data is deployed across the
- nodes that make up the cluster. A <emphasis>spray</emphasis> or import
- is the relocation of a data file from one location (such as a Landing
- Zone) to a cluster. The term spray was adopted due to the nature of the
- file movement – the file is partitioned across all nodes within a
- cluster.</para>
- <para>A <emphasis>despray</emphasis> or export is the relocation of a
- data file from a Data Refinery cluster to a single machine location
- (such as a Landing Zone). The term despray was adopted due to the nature
- of the file movement – the file is reassembled from its parts on all
- nodes in the cluster and placed in a single file on the
- destination.</para>
- <para>A <emphasis>Landing Zone</emphasis> (or drop zone) is a physical
- storage location defined in your system's environment. There can be one
- or more of these locations defined. A daemon (dafilesrv) must be running
- on that server to enable file sprays and desprays. You can spray or
- despray some files to your landing zone through ECL Watch. To upload
- large files, you will need a tool that supports the secure copy
- protocol, such as WinSCP.</para>
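- <para>For instance, a large file could be copied to the landing zone
- with a standard secure copy command; the host name, user, and
- dropzone path below are examples only:</para>
- <programlisting>scp mydata.csv hpccuser@landingzone.example.com:/var/lib/HPCCSystems/mydropzone/</programlisting>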
- <para>For more information about HPCC data handling see the
- <emphasis>HPCC Data Handling</emphasis> and the <emphasis>HPCC Data
- Tutorial</emphasis> documents.</para>
- </sect1>
- <!--add-certify-->
- </chapter>
- <chapter id="Best_Practices_Chapter">
- <title>Best Practices</title>
- <para>This chapter outlines various best practices established by
- long-time HPCC users and administrators running HPCC in a demanding,
- high-availability production environment. You are not required
- to run your environment in this manner, as your specific
- requirements may vary; rather, this section provides some best practice
- recommendations established after several years of running HPCC in a
- demanding, intense production environment.</para>
- <sect1 id="BP_Cluster_Redundancy" role="nobrk">
- <title>Cluster Redundancy</title>
- <para>There are several aspects of cluster redundancy that should be
- considered when setting up your HPCC system.</para>
- <para><informaltable colsep="1" frame="all" rowsep="1">
- <?dbfo keep-together="always"?>
- <tgroup cols="2">
- <colspec colwidth="49.50pt" />
- <colspec />
- <tbody>
- <row>
- <entry><inlinegraphic fileref="images/tip.jpg" /></entry>
- <entry><para>Make sure you allocate ample resources to your
- key components. Dali is RAM intensive. ECL Agent and ECL
- Server are processor dependent. Thor should have a minimum of
- 4GB RAM per node.</para><para> </para></entry>
- </row>
- </tbody>
- </tgroup>
- </informaltable></para>
- <sect2>
- <title>Dali</title>
- <para>Dali should be run in an active/passive configuration.
- Active/passive means you would have two Dalis running: one primary,
- or active, and the other passive. In this scenario all actions are run
- on the active Dali, but duplicated on the passive one. If the active
- Dali fails, then you can fail over to the passive Dali.</para>
- <para>Another suggested best practice is to use standard clustering
- with a quorum and a takeover VIP (a kind of load balancer). If the
- primary Dali fails, you move the VIP and data directory over to the
- passive node and restart the Dali service.</para>
- </sect2>
- <sect2>
- <title>DFU Server</title>
- <para>You can run multiple instances of the DFU Server. You can run
- all instances as active, as opposed to an active/passive
- configuration. There is no need for a load balancer or VIP. Each
- instance routinely queries the Dali for workunits. Should one fail,
- the other(s) will continue to pull new workunits.</para>
- </sect2>
- <sect2>
- <title>ECLCC Server</title>
- <para>You can run multiple active instances of the ECLCC Server for
- redundancy. There is no need for a load balancer or VIP for this
- either. Each instance will routinely check for workunits. Should one
- fail, the other(s) will continue to compile.</para>
- </sect2>
- <sect2>
- <title>ESP/ECL Watch/WsECL</title>
- <para>To establish redundancy, place the ESP Servers in a VIP. For an
- active/active design, you must use a load balancer. For active/passive
- you can use pacemaker/heartbeat. If you run active/active, you should
- maintain a single client's connection to a single server for the life
- of a session for ECL Watch (port 8010). Other services, such as WsECL
- (port 8002) do not require a persistent connection to a single
- server.</para>
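- <para>As a simple availability check behind the VIP or load balancer,
- you can request each service port directly; the address below is an
- example only:</para>
- <programlisting>curl -I http://192.168.1.10:8010/   # ECL Watch
- curl -I http://192.168.1.10:8002/   # WsECL</programlisting>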
- </sect2>
- <sect2>
- <title>ECL Agent</title>
- <para>You can run multiple active instances of the ECL Agent. There is
- no need for a load balancer or VIP. Each instance routinely queries for
- workunits. Should one fail, the other(s) will continue to pull new
- workunits.</para>
- </sect2>
- <sect2>
- <title>Sasha</title>
- <para>Sasha should be run in an active/passive configuration.
- Active/passive means you would have two Sashas configured: one
- primary (active), and the other standing by.</para>
- </sect2>
- <sect2>
- <title>ECL Scheduler</title>
- <para>There is no need for a load balancer; the ECL Scheduler runs
- active/active. Each instance routinely queries for workunits. Should
- one fail, the other(s) will continue to schedule workunits.</para>
- </sect2>
- <sect2>
- <title>Thormaster</title>
- <para>Set up Thor in an active/passive configuration. Active/passive
- means you would have two instances running: one primary (active),
- and the other passive. No load balancer is needed. If the active instance
- fails, then you can fail over to the passive. Failover then uses the
- VIP (a kind of load balancer) to distribute any incoming
- requests.</para>
- </sect2>
- <sect2>
- <title>Dropzone</title>
- <para>This is just a fileserver that runs the dafilesrv process.
- Configure in the same fashion as you would any active/passive file
- server. One primary, or active, and the other passive. No load
- balancer needed. If the active instance fails, then you can fail over
- to the passive.</para>
- </sect2>
- </sect1>
- <sect1 id="BP_High_Availability">
- <title>High Availability and Disaster Recovery</title>
- <para>If you require high availability for your HPCC system, there are
- some additional considerations that you should be aware of. This is not a
- comprehensive list, nor is it meant to be step-by-step instructions
- for setting up disaster recovery. Instead this section just provides
- some more information to consider when incorporating HPCC into your
- disaster recovery plan.</para>
- <sect2 id="Thor_HA">
- <title>Thor</title>
- <para>When designing a Thor cluster for high availability, consider
- how it actually works -- a Thor cluster accepts jobs from a job queue.
- If there are two Thor clusters handling the queue, one will continue
- accepting jobs if the other one fails.</para>
- <para>If a single component (thorslave or thormaster) fails, the other
- will continue to process requests. With replication enabled, it will
- be able to read data from the back up location of the broken Thor.
- Other components (such as ECL Server, or ESP) can also have multiple
- instances. The remaining components, such as Dali, or DFU Server, work
- in a traditional shared storage high availability fail over
- model.</para>
- <sect3>
- <title>The Downside</title>
- <para>It costs twice as much initially because you essentially have to
- have two of everything.</para>
- </sect3>
- <sect3>
- <title>The Upside</title>
- <para>Almost 100% of the time you can utilize the additional
- processing capacity. You can run more jobs, have more space,
- etc.</para>
- </sect3>
- <sect3>
- <title>Disaster Recovery concerns</title>
- <para>The important factor to consider for disaster recovery (DR) is
- the bandwidth required to replicate your data. Your network
- administrator should evaluate this aspect carefully.</para>
- <para>If you have tens of gigabytes of delta each day then an rsync
- type replication or some sort of hybrid model should suffice. If you
- have hundreds of gigabytes to petabytes of deltas, the real limit is
- your budget.</para>
- <para>A best practice is to find where the data is the smallest (at
- ingestion, after normalization, at Roxie) and replicate from that
- point and rerun the processing in both locations.</para>
- <para>The key to getting disaster recovery right is to know your
- data flow. For instance, if you are ingesting 20TB of raw data
- daily and then rolling that raw data up, scoring it, and
- indexing it, you would be better off replicating an
- intermediate dataset (what we call base files) rather than
- replicating the large ingest. If the opposite is occurring (a small
- daily ingest that then grows greatly in size during processing), you would be
- better off replicating the input and then re-running the processing.</para>
- <para>Thor has the ability to do a “Thor copy,” which copies data
- from one cluster to another. You can also do this through ECL code.
- Additionally, you may decide you don’t want, or need, a “hot”
- DR Thor. In that case, the most common minor disasters cause only a
- relatively brief outage of less than a day. Since Thor is
- responsible for creating data updates, it can take a day or a few to
- recover; the data just is not quite as fresh, but as long as the
- Roxies are replicated the data is still flowing. A major
- disaster, such as a major earthquake, a tidal wave, extended
- total power loss, or multiple fiber cuts, could take the systems out
- for a day or more, but the likelihood of that occurring may not justify
- the costs of protecting against it.</para>
- </sect3>
- <sect3>
- <title>Conclusion</title>
- <para>Disaster recovery is a calculation: the cost of failure, times
- the likelihood per year of an event occurring, weighed against
- the cost of protecting against it. Taking all that into
- consideration can help you put a sensible DR plan in
- place.</para>
- </sect3>
- </sect2>
- <sect2 id="HA_Roxie">
- <title>Roxie</title>
- <para>In the case of Roxie, a best practice is to have multiple Roxie
- clusters and use a proxy to balance the load. To keep the data
- in sync, a pull approach is best. The Roxie automatically pulls the
- data it needs from the “source” listed in the package file. The data
- can also be pulled from another Roxie or a Thor. In most cases you
- would pull to your DR Roxie from the primary Roxie out of the load
- balancer, but it can also pull from a Thor in the primary location as
- well.</para>
- </sect2>
- <sect2 id="HA_Middlewear">
- <title>Middleware</title>
- <para>Replication of some components (ECL Agent, ESP/Eclwatch, DFU
- Server, etc.) is pretty straightforward, as they really don’t have
- anything to replicate. Dali is the biggest consideration when it comes
- to replication. In the case of Dali, you have Sasha as the back up
- locally. The Dali files can be replicated using rsync. A better
- approach could be to use a synchronizing device (cluster WAN sync, SAN
- block replication, etc.), put the Dali stores on that, and
- allow it to replicate as designed.</para>
- <para>There isn't just a one size fits all approach. Special care,
- design, and planning are required to make an effective DR strategy
- that doesn't “over synchronize” across slow WAN links, but still
- provides you with an acceptable level of redundancy for your business
- needs.</para>
- </sect2>
- </sect1>
- <sect1>
- <title>Best Practice Considerations</title>
- <para>There are several other aspects to best practice considerations,
- and these will change with your system requirements. The following
- sections are some best practice considerations for some aspects of the
- HPCC system. Keep in mind that these best practices are merely
- suggestions and may not be appropriate for your needs. A thorough review
- of the considerations highlighted here can be very helpful if your needs
- align with the stated considerations.</para>
- <!--/*Further elaboration of both User permissions, and permission settings... also some hardware set up best practices. Suggested***/-->
- <sect2>
- <title>Multiple Thors</title>
- <para>You can run multiple Thors on the same physical hardware.
- Multiple Thors on the same hardware are independent and unaware of
- each other. The Thors run jobs as they receive them, regardless of
- what the other(s) is/are doing. The speed of a single job will never
- be faster with multiple Thors, but the throughput can be. You can run
- two Thors picking up jobs from two different queues or the same
- queue.</para>
- <para>The downside of running multiple Thors on the same hardware is
- that the physical memory on the nodes needs to be shared among each of
- the Thors. This needs to be configured per Thor cluster
- definition.</para>
- <para>You must not place multiple Thors on hardware which does not
- have enough CPU cores to support them. You should not have more Thors
- than the number of cores. One good rule is that the
- number of cores divided by two is the maximum number of Thor clusters
- to use; for example, a 48-core server should host no more than 24 Thor
- clusters.</para>
- </sect2>
- </sect1>
- <xi:include href="HPCCSystemAdmin/SA-Mods/SysAdminConfigMod.xml"
- xpointer="Sample_Sizings"
- xmlns:xi="http://www.w3.org/2001/XInclude" />
- </chapter>
- <chapter id="Resources">
- <title>System Resources</title>
- <para>There are additional resources available for the HPCC System.</para>
- <sect1 id="HPCC_Resources" role="nobrk">
- <title>HPCC Resources</title>
- <para>The Resources link in ECL Watch, found under the Operations
- icon, provides a link to the HPCC
- Systems<superscript>®</superscript> web portal. Visit the HPCC
- Systems<superscript>®</superscript> Web Portal at <ulink
- url="http://hpccsystems.com/">http://hpccsystems.com/</ulink> for
- software updates, plug-ins, support, documentation, and more. The
- web portal is where you can find resources useful for running and
- maintaining HPCC.</para>
- <para>ECL Watch provides a link to the HPCC portal's download page:
- <ulink
- url="http://hpccsystems.com/download">http://hpccsystems.com/download</ulink>.
- This is the page where you can download Installation packages, virtual
- images, source code, documentation, and tutorials.</para>
- </sect1>
- <sect1>
- <title>Additional Resources</title>
- <para>Additional help for learning ECL is also available, including
- online courses.</para>
- <para><ulink
- url="https://learn.lexisnexis.com/lexisnexis/resources/courses">https://learn.lexisnexis.com/lexisnexis/resources/courses
- </ulink></para>
- <para>There are training videos online.</para>
- <para><ulink
- url="https://learn.lexisnexis.com/lexisnexis/resources/courses/HPCC/Summit2014/NewECLWatch50Features/NewECLWatch50Features.html">Legacy
- ECL Watch and New 5.0 ECL Watch</ulink></para>
- <para>This video provides a summary of the differences in the interface and goes into
- particular detail. It is helpful for learning how to deploy Roxies.</para>
- </sect1>
- </chapter>
- </book>