RONCC
/
Big-Data-HPC-Platform


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
							<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE sect1 PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
"http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
<sect1 id="Simple_Random_Samples">
  <title>Simple Random Samples</title>

  <para>There is a statistical concept called a “Simple Random Sample” in
  which a statistically “random” (different from simply using the RANDOM()
  function) sample of records is generated from any dataset. The algorithm
  implemented in the following code example was provided by a customer.</para>

  <para>This code is implemented as a MACRO to allow multiple samples to be
  produced in the same workunit (contained in the SimpleRandomSamples.ECL
  file):</para>

  <programlisting>SimpleRandomSample(InFile,UID_Field,SampleSize,Result) := MACRO
  //build a table of the UIDs
  #UNIQUENAME(Layout_Plus_RecID)
  %Layout_Plus_RecID% := RECORD
     UNSIGNED8 RecID := 0;
     InFile.UID_Field;
  END;
  #UNIQUENAME(InTbl)
  %InTbl% := TABLE(InFile,%Layout_Plus_RecID%);

  //then assign unique record IDs to the table entries
  #UNIQUENAME(IDRecs)
  %Layout_Plus_RecID% %IDRecs%(%Layout_Plus_RecID% L, INTEGER C) :=
     TRANSFORM
     SELF.RecID := C;
     SELF := L;
  END;
  #UNIQUENAME(UID_Recs)
  %UID_Recs% := PROJECT(%InTbl%,%IDRecs%(LEFT,COUNTER));

   //discover the number of records
   #UNIQUENAME(WholeSet)
   %WholeSet% := COUNT(InFile) : GLOBAL;

   //then generate the unique record IDs to include in the sample
   #UNIQUENAME(BlankSet)
   %BlankSet% := DATASET([{0}],{UNSIGNED8 seq});
   #UNIQUENAME(SelectEm)
   TYPEOF(%BlankSet%) %SelectEm%(%BlankSet% L, INTEGER c) := TRANSFORM
     SELF.seq := ROUNDUP(%WholeSet% * (((RANDOM()%100000)+1)/100000));
   END;
   #UNIQUENAME(selected)
   %selected% := NORMALIZE( %BlankSet%, SampleSize,
                           %SelectEm%(LEFT, COUNTER));

  //then filter the original dataset by the selected UIDs
  #UNIQUENAME(SetSelectedRecs)
%SetSelectedRecs% := SET(%UID_Recs%(RecID IN SET(%selected%,seq)),
                          UID_Field);
  result := infile(UID_Field IN %SetSelectedRecs% );
ENDMACRO;
</programlisting>

  <para>This MACRO takes four parameters:</para>

  <para>* The name of the file to sample * The name of the unique identifier
  field in that file * The size of the sample to generate * The name of the
  attribute for the result, so that it may be post-processed</para>

  <para>The algorithm itself is fairly simple. We first create a TABLE of
  uniquely numbered unique identifier fields. Then we use NORMALIZE to produce
  a recordset of the candidate records. Which candidate is chosen each time
  the TRANSFORM function is called is determined by generating a “random”
  value between zero and one, using modulus division by one hundred thousand
  on the return from the RANDOM() function, then multiplying that result by
  the number of records to sample from, rounding up to the next larger
  integer. This determines the position of the field identifier to use. Once
  the set of positions within the TABLE is determined, they are used to define
  the SET of unique fields to use in the final result.</para>

  <para>This algorithm is designed to produce a sample “with replacement” so
  that it is possible to have a smaller number of records returned than the
  sample size requested. To produce exactly the size sample you need (that is,
  a “without replacement” sample), you can request a larger sample size (say,
  10% larger) then use the CHOOSEN function to retrieve only the actual number
  of records required, as in this example (also contained in the
  SimpleRandomSamples.ECL file).</para>

  <programlisting>SomeFile := DATASET([{'A1'},{'B1'},{'C1'},{'D1'},{'E1'},
                     {'F1'},{'G1'},{'H1'},{'I1'},{'J1'},
                     {'K1'},{'L1'},{'M1'},{'N1'},{'O1'},
                     {'P1'},{'Q1'},{'R1'},{'S1'},{'T1'},
                     {'U1'},{'V1'},{'W1'},{'X1'},{'Y1'},
                     {'A2'},{'B2'},{'C2'},{'D2'},{'E2'},
                     {'F2'},{'G2'},{'H2'},{'I2'},{'J2'},
                     {'K2'},{'L2'},{'M2'},{'N2'},{'O2'},
                     {'P2'},{'Q2'},{'R2'},{'S2'},{'T2'},
                     {'U2'},{'V2'},{'W2'},{'X2'},{'Y2'},
                     {'A3'},{'B3'},{'C3'},{'D3'},{'E3'},
                     {'F3'},{'G3'},{'H3'},{'I3'},{'J3'},
                     {'K3'},{'L3'},{'M3'},{'N3'},{'O3'},
                     {'P3'},{'Q3'},{'R3'},{'S3'},{'T3'},
                     {'U3'},{'V3'},{'W3'},{'X3'},{'Y3'},
                     {'A4'},{'B4'},{'C4'},{'D4'},{'E4'},
                     {'F4'},{'G4'},{'H4'},{'I4'},{'J4'},
                     {'K4'},{'L4'},{'M4'},{'N4'},{'O4'},
                     {'P4'},{'Q4'},{'R4'},{'S4'},{'T4'},
                     {'U4'},{'V4'},{'W4'},{'X4'},{'Y4'}
                     ],{STRING2 Letter});

ds := DISTRIBUTE(SomeFile,HASH(letter[2]));
SimpleRandomSample(ds,Letter,6, res1) //ask for 6
SimpleRandomSample(ds,Letter,6, res2)
SimpleRandomSample(ds,Letter,6, res3)

OUTPUT(CHOOSEN(res1,5)); //actually need 5
OUTPUT(CHOOSEN(res3,5));
</programlisting>
</sect1>