PrG_Simple_Random_Samples.xml 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. <?xml version="1.0" encoding="UTF-8"?>
  2. <!DOCTYPE sect1 PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
  3. "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
  4. <sect1 id="Simple_Random_Samples">
  5. <title>Simple Random Samples</title>
  6. <para>There is a statistical concept called a “Simple Random Sample” in
  7. which a statistically “random” (different from simply using the RANDOM()
  8. function) sample of records is generated from any dataset. The algorithm
  9. implemented in the following code example was provided by a customer.</para>
  10. <para>This code (contained in the SimpleRandomSamples.ECL file) is
  11. implemented as a MACRO to allow multiple samples to be produced in the
  12. same workunit:</para>
  13. <programlisting>SimpleRandomSample(InFile,UID_Field,SampleSize,Result) := MACRO //puts the randomly sampled InFile records in Result
  14. //build a table pairing a zero RecID with the UID_Field value of each InFile record
  15. #UNIQUENAME(Layout_Plus_RecID)
  16. %Layout_Plus_RecID% := RECORD
  17. UNSIGNED8 RecID := 0;
  18. InFile.UID_Field;
  19. END;
  20. #UNIQUENAME(InTbl)
  21. %InTbl% := TABLE(InFile,%Layout_Plus_RecID%);
  22. //then assign record IDs to the table entries: RecID takes the PROJECT COUNTER value
  23. #UNIQUENAME(IDRecs)
  24. %Layout_Plus_RecID% %IDRecs%(%Layout_Plus_RecID% L, INTEGER C) :=
  25. TRANSFORM
  26. SELF.RecID := C;
  27. SELF := L;
  28. END;
  29. #UNIQUENAME(UID_Recs)
  30. %UID_Recs% := PROJECT(%InTbl%,%IDRecs%(LEFT,COUNTER));
  31. //discover the number of records in InFile
  32. #UNIQUENAME(WholeSet)
  33. %WholeSet% := COUNT(InFile) : GLOBAL;
  34. //then generate the record IDs to include in the sample (the same ID may be drawn more than once)
  35. #UNIQUENAME(BlankSet)
  36. %BlankSet% := DATASET([{0}],{UNSIGNED8 seq});
  37. #UNIQUENAME(SelectEm)
  38. TYPEOF(%BlankSet%) %SelectEm%(%BlankSet% L, INTEGER c) := TRANSFORM
  39. SELF.seq := ROUNDUP(%WholeSet% * (((RANDOM()%100000)+1)/100000)); //fraction in steps of 1/100000, scaled to the record count and rounded up
  40. END;
  41. #UNIQUENAME(selected)
  42. %selected% := NORMALIZE( %BlankSet%, SampleSize,
  43. %SelectEm%(LEFT, COUNTER)); //the single seed record is expanded SampleSize times, one draw each
  44. //then filter the original dataset by the UIDs found at the selected record IDs
  45. #UNIQUENAME(SetSelectedRecs)
  46. %SetSelectedRecs% := SET(%UID_Recs%(RecID IN SET(%selected%,seq)),
  47. UID_Field);
  48. result := infile(UID_Field IN %SetSelectedRecs% ); //assumes UID_Field values are unique, otherwise extra records match
  49. ENDMACRO;
  50. </programlisting>
  51. <para>This MACRO takes four parameters:</para>
  52. <para>(1) the name of the file to sample; (2) the name of the unique
  53. identifier field in that file; (3) the size of the sample to generate; and
  54. (4) the name of the attribute for the result, so that it may be post-processed.</para>
  55. <para>The algorithm itself is fairly simple. We first create a TABLE of uniquely numbered unique
  56. identifier fields. Then we use NORMALIZE to produce a recordset of candidate record positions. Which
  57. candidate is chosen each time the TRANSFORM function is called is determined by generating a “random”
  58. value greater than zero and no larger than one: the return from the RANDOM() function is reduced by
  59. modulus division by one hundred thousand, one is added, and the sum is divided by one hundred
  60. thousand. That value is multiplied by the number of records to sample from and rounded up to the next
  61. larger integer, which determines the position of the field identifier to use. (Because the value
  62. moves in steps of one hundred-thousandth, at most one hundred thousand distinct positions can ever be
  63. selected.) Once the set of positions within the TABLE is determined, they are used to define the SET
  64. of unique fields to use in the final result.</para>
  65. <para>This algorithm is designed to produce a sample “with replacement”: the same position
  66. may be selected more than once, and because the selected positions are reduced to a SET, it
  67. is possible to have a smaller number of records returned than the sample size requested. To
  68. produce exactly the size sample you need (that is, a “without replacement” sample), you can
  69. request a larger sample size (say, 10% larger) and then use the CHOOSEN function to retrieve
  70. only the actual number of records required, as in this example (also contained in the
  71. SimpleRandomSamples.ECL file).</para>
  72. <programlisting>SomeFile := DATASET([{'A1'},{'B1'},{'C1'},{'D1'},{'E1'},
  73. {'F1'},{'G1'},{'H1'},{'I1'},{'J1'},
  74. {'K1'},{'L1'},{'M1'},{'N1'},{'O1'},
  75. {'P1'},{'Q1'},{'R1'},{'S1'},{'T1'},
  76. {'U1'},{'V1'},{'W1'},{'X1'},{'Y1'},
  77. {'A2'},{'B2'},{'C2'},{'D2'},{'E2'},
  78. {'F2'},{'G2'},{'H2'},{'I2'},{'J2'},
  79. {'K2'},{'L2'},{'M2'},{'N2'},{'O2'},
  80. {'P2'},{'Q2'},{'R2'},{'S2'},{'T2'},
  81. {'U2'},{'V2'},{'W2'},{'X2'},{'Y2'},
  82. {'A3'},{'B3'},{'C3'},{'D3'},{'E3'},
  83. {'F3'},{'G3'},{'H3'},{'I3'},{'J3'},
  84. {'K3'},{'L3'},{'M3'},{'N3'},{'O3'},
  85. {'P3'},{'Q3'},{'R3'},{'S3'},{'T3'},
  86. {'U3'},{'V3'},{'W3'},{'X3'},{'Y3'},
  87. {'A4'},{'B4'},{'C4'},{'D4'},{'E4'},
  88. {'F4'},{'G4'},{'H4'},{'I4'},{'J4'},
  89. {'K4'},{'L4'},{'M4'},{'N4'},{'O4'},
  90. {'P4'},{'Q4'},{'R4'},{'S4'},{'T4'},
  91. {'U4'},{'V4'},{'W4'},{'X4'},{'Y4'}],{STRING2 Letter}); //100 two-character test values, A1 through Y4
  92. ds := DISTRIBUTE(SomeFile,HASH(letter[2])); //redistribute by a hash of the second character
  93. SimpleRandomSample(ds,Letter,6, res1) //ask for 6
  94. SimpleRandomSample(ds,Letter,6, res2) //three separate samples in the same workunit
  95. SimpleRandomSample(ds,Letter,6, res3)
  96. OUTPUT(CHOOSEN(res1,5)); //actually need 5
  97. OUTPUT(CHOOSEN(res2,5)); //every generated sample is trimmed and output
  98. OUTPUT(CHOOSEN(res3,5));
  99. </programlisting>
  100. </sect1>