jlzw.cpp 75 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. // JLIB LZW compression class
  14. #include "platform.h"
  15. #include "jmisc.hpp"
  16. #include "jlib.hpp"
  17. #include <time.h>
  18. #include "jfile.hpp"
  19. #include "jencrypt.hpp"
  20. #include "jflz.hpp"
  21. #ifdef _WIN32
  22. #include <io.h>
  23. #endif
  24. #include "jlzw.ipp"
// Sentinel stored in inlenblk while no startblock() is outstanding.
#define COMMITTED ((size32_t)-1)
// Code widths: the dictionary starts at BITS_LO bits per code and is bumped one
// bit at a time (see LZWDictionary::bumpbits) until it reaches BITS_HI.
#define BITS_LO 9
#define BITS_HI 15
#define MAX_CODE ((1<<BITS_HI)-1)
// Codes 0..255 stand for literal bytes. BUMP_CODE is written in-band when the
// dictionary reaches the current width limit; FIRST_CODE is the first code
// assigned to a dictionary entry.
#define BUMP_CODE 257
#define FIRST_CODE 258
// Subtracted from the caller's buffer size in CLZWCompressor::open to obtain maxlen.
#define SAFETY_MARGIN 16 // for 15 bits
// The low BITS_ALWAYS bits of every code are stored as a whole byte; the
// remaining high bits are packed into bytes of BITS_PER_UNIT bits.
#define BITS_PER_UNIT 8
#define BITS_ALWAYS 8
#define ALWAYS_MASK ((1<<BITS_ALWAYS)-1)
typedef unsigned long bucket_t;
// typedef long long lbucket_t;
typedef __int64 lbucket_t;
//#define STATS
//#define TEST
// ASSERT only checks in _DEBUG builds; otherwise it expands to nothing and its
// argument is not evaluated.
#ifdef _DEBUG
#define ASSERT(a) assertex(a)
#else
#define ASSERT(a)
#endif
  45. LZWDictionary::LZWDictionary()
  46. {
  47. curbits = 0;
  48. }
  49. void LZWDictionary::initdict()
  50. {
  51. nextcode = FIRST_CODE;
  52. curbits = BITS_LO;
  53. nextbump = 1<<BITS_LO;
  54. }
  55. bool LZWDictionary::bumpbits()
  56. {
  57. if (curbits==BITS_HI)
  58. return false;
  59. curbits++;
  60. nextbump = 1<<curbits;
  61. return true;
  62. }
// Optional instrumentation, compiled in only when STATS is defined.
// The counters are file-wide; CLZWCompressor::open/write/close update them and
// the CLZWCompressor destructor prints them.
#ifdef STATS
static unsigned st_tottimems=0;
static unsigned st_maxtime=0;
static int st_maxtime_writes=0;
static int st_totwrites=0;
static int st_totblocks=0;
static int st_totwritten=0; // in K
static int st_totread=0; // in K
static unsigned st_thistime=0;
static int st_thiswrites=0;
static unsigned st_totbitsize=0;
static unsigned st_totbitsizeuc=0;
#endif
  76. CLZWCompressor::CLZWCompressor(bool _supportbigendian)
  77. {
  78. outbuf = NULL;
  79. outlen = 0;
  80. maxlen = 0;
  81. bufalloc = 0;
  82. inuseflag=0xff;
  83. supportbigendian = _supportbigendian;
  84. }
  85. CLZWCompressor::~CLZWCompressor()
  86. {
  87. if (bufalloc)
  88. free(outbuf);
  89. #ifdef STATS
  90. printf("HLZW STATS:\n");
  91. printf(" st_tottimems = %d\n",st_tottimems);
  92. printf(" st_maxtime = %d\n",st_maxtime);
  93. printf(" st_maxtime_writes = %d\n",st_maxtime_writes);
  94. printf(" st_totwrites = %d\n",st_totwrites);
  95. printf(" st_totblocks = %d\n",st_totblocks);
  96. printf(" st_totwritten = %dK\n",st_totwritten); // in K
  97. printf(" st_totread = %dK\n",st_totread); // in K
  98. printf(" st_totbitsize = %d\n",st_totbitsize);
  99. printf(" st_totbitsizeuc = %d\n",st_totbitsizeuc);
  100. #endif
  101. }
  102. void CLZWCompressor::initdict()
  103. {
  104. dict.initdict();
  105. // use inuseflag rather than clearing as this can take a large proportion of the time
  106. // (e.g. in hozed)
  107. if (inuseflag==0xff) {
  108. memset(dictinuse,0,sizeof(dictinuse));
  109. inuseflag=0;
  110. }
  111. inuseflag++;
  112. }
  113. struct ShiftInfo {
  114. int mask1;
  115. int shift2; // NB right shift, not left
  116. int mask2;
  117. int padding; // make it multiple of 4
  118. };
  119. ShiftInfo ShiftArray[BITS_HI-BITS_ALWAYS+1][BITS_PER_UNIT];
  120. static struct __initShiftArray {
  121. __initShiftArray()
  122. {
  123. for (unsigned numBits = BITS_LO; numBits <= BITS_HI; ++numBits) {
  124. unsigned copyBits = numBits-BITS_ALWAYS;
  125. unsigned mask = (1<<numBits)-1-ALWAYS_MASK;
  126. for (unsigned shift = 0; shift < BITS_PER_UNIT; shift++) {
  127. ShiftInfo & cur = ShiftArray[copyBits][shift];
  128. if (shift + copyBits <= BITS_PER_UNIT) {
  129. cur.mask1 = mask;
  130. cur.shift2 = 0;
  131. cur.mask2 = 0;
  132. }
  133. else {
  134. cur.shift2 = BITS_PER_UNIT + BITS_ALWAYS - shift;
  135. cur.mask1 = (1<<cur.shift2)-1-ALWAYS_MASK;
  136. cur.mask2 = mask - cur.mask1;
  137. }
  138. }
  139. }
  140. }
  141. } __do_initShiftArray;
  142. #define PUTCODE(code) \
  143. { \
  144. register unsigned inbits=code; \
  145. register int shift=curShift; \
  146. int copyBits = dict.curbits - BITS_PER_UNIT; \
  147. \
  148. *(outbytes++) = (unsigned char)(inbits&0xff); \
  149. ShiftInfo & cur = ShiftArray[copyBits][shift]; \
  150. outbitbuf |= (inbits & cur.mask1) >> (BITS_ALWAYS-shift); \
  151. shift += copyBits; \
  152. if (shift >= BITS_PER_UNIT) \
  153. { \
  154. shift -= BITS_PER_UNIT; \
  155. *(outbits++) = outbitbuf; \
  156. if (outbits==outnext) { \
  157. outbytes = outnext; \
  158. outbits = outbytes+BITS_ALWAYS; \
  159. outnext += dict.curbits; \
  160. outlen += dict.curbits; \
  161. ASSERT(shift==0); \
  162. } \
  163. outbitbuf = 0; \
  164. if (shift != 0) \
  165. outbitbuf = (inbits & cur.mask2) >> cur.shift2; \
  166. } \
  167. curShift = shift; \
  168. }
// GETCODE(ret): reads the next dict.curbits-wide code into ret.
//
// Mirror image of PUTCODE: the low 8 bits come from the byte at inbytes, the
// high bits are taken from the bit byte at inbits using the ShiftArray entry
// for the current width and bit position (curShift). When the high bits
// straddle two bit bytes the remainder is read from the following one. Once
// inbits reaches innext the pointers move on to the next group of
// dict.curbits bytes.
//
// Expects inbytes, inbits, innext, curShift and dict to be in scope.
#define GETCODE(ret) \
{ \
    int shift=curShift; \
    int copyBits = dict.curbits - BITS_PER_UNIT; \
    \
    ret = *(inbytes++); \
    ShiftInfo & cur = ShiftArray[copyBits][shift]; \
    ret |= (*inbits << (BITS_ALWAYS-shift)) & cur.mask1; \
    shift += copyBits; \
    if (shift >= BITS_PER_UNIT) \
    { \
        shift -= BITS_PER_UNIT; \
        inbits++; \
        if (inbits==innext) { \
            inbytes = innext; \
            inbits = inbytes+BITS_ALWAYS; \
            innext += dict.curbits; \
            ASSERT(shift==0); \
        } \
        if (shift != 0) \
            ret |= (*inbits << cur.shift2) & cur.mask2; \
    } \
    curShift = shift; \
}
  193. void CLZWCompressor::flushbuf()
  194. {
  195. if (outbytes==outnext)
  196. return;
  197. *(outbits++) = outbitbuf;
  198. while (outbits!=outnext) {
  199. *(outbits++) = 0;
  200. }
  201. do {
  202. *(outbytes++) = 0;
  203. } while (outbytes+(dict.curbits-8)!=outnext);
  204. }
  205. void CLZWCompressor::open(void *buf,size32_t max)
  206. {
  207. #ifdef STATS
  208. st_thistime = msTick();
  209. st_thiswrites=0;
  210. #endif
  211. if (buf) {
  212. if (bufalloc) {
  213. free(outbuf);
  214. }
  215. bufalloc = 0;
  216. outbuf = buf;
  217. }
  218. else if (max>bufalloc) {
  219. if (bufalloc)
  220. free(outbuf);
  221. bufalloc = max;
  222. outbuf = malloc(bufalloc);
  223. }
  224. ASSERT(max>SAFETY_MARGIN+sizeof(size32_t)); // minimum required
  225. maxlen=max-SAFETY_MARGIN;
  226. ASSERT(dict.curbits==0); // check for open called twice with no close
  227. initdict();
  228. curcode = -1;
  229. inlen = 0;
  230. inlenblk = COMMITTED;
  231. memset(outbuf,0,sizeof(size32_t));
  232. outlen = sizeof(size32_t)+dict.curbits;
  233. outbytes = (unsigned char *)outbuf+sizeof(size32_t);
  234. outbits = outbytes+8;
  235. outnext = outbytes+dict.curbits;
  236. curShift=0; //outmask = 0x80;
  237. outbitbuf = 0;
  238. }
  239. #define HASHC(code,ch) (((0x01000193*(unsigned)code)^(unsigned char)ch)%LZW_HASH_TABLE_SIZE)
  240. #define BE_MEMCPY4(dst,src) { if (supportbigendian) _WINCPYREV4(dst,src); else memcpy(dst,src,4); }
  241. size32_t CLZWCompressor::write(const void *buf,size32_t buflen)
  242. {
  243. if (!buflen)
  244. return 0;
  245. if (!dict.curbits)
  246. return 0;
  247. unsigned char *in=(unsigned char *)buf;
  248. #ifdef STATS
  249. st_thiswrites++;
  250. #endif
  251. size32_t len=buflen;
  252. if (curcode==-1) {
  253. curcode = *(in++);
  254. len--;
  255. }
  256. while (len--) {
  257. int ch = *(in++);
  258. int index = HASHC(curcode,ch);
  259. for (;;) {
  260. if (dictinuse[index]!=inuseflag) {
  261. dictinuse[index] = inuseflag;
  262. dictcode[index] = dict.nextcode++;
  263. dict.dictparent[index] = curcode;
  264. dict.dictchar[index] = (unsigned char) ch;
  265. PUTCODE(curcode);
  266. if ((outlen>=maxlen)) {
  267. size32_t ret;
  268. if (inlenblk==COMMITTED) {
  269. ret = in-(unsigned char *)buf-1;
  270. inlen += in-(unsigned char *)buf-1;
  271. }
  272. else
  273. ret = 0;
  274. close();
  275. return ret;
  276. }
  277. if (dict.nextcode == dict.nextbump) {
  278. PUTCODE(BUMP_CODE);
  279. flushbuf();
  280. bool eodict = !dict.bumpbits();
  281. if (eodict) {
  282. initdict();
  283. }
  284. outbytes = outnext;
  285. outbits = outbytes+8;
  286. outnext += dict.curbits;
  287. outlen += dict.curbits;
  288. curShift=0;//outmask = 0x80;
  289. outbitbuf = 0;
  290. }
  291. curcode = ch;
  292. break;
  293. }
  294. if (dict.dictparent[index] == curcode &&
  295. dict.dictchar[index] == (unsigned char)ch) {
  296. curcode = dictcode[index];
  297. break;
  298. }
  299. index--;
  300. if (index<0) {
  301. index = LZW_HASH_TABLE_SIZE-1;
  302. }
  303. }
  304. }
  305. inlen += buflen;
  306. return buflen;
  307. }
  308. void CLZWCompressor::startblock()
  309. {
  310. inlenblk = inlen;
  311. }
  312. void CLZWCompressor::commitblock()
  313. {
  314. inlenblk = COMMITTED;
  315. }
  316. void CLZWCompressor::close()
  317. {
  318. if (dict.curbits) {
  319. PUTCODE(curcode);
  320. flushbuf();
  321. dict.curbits = 0;
  322. if (inlenblk!=COMMITTED)
  323. inlen = inlenblk; // transaction failed
  324. inlenblk = COMMITTED;
  325. BE_MEMCPY4(outbuf,&inlen);
  326. #ifdef STATS
  327. unsigned t = (msTick()-st_thistime);
  328. if (t>st_maxtime) {
  329. st_maxtime = t;
  330. st_maxtime_writes = st_thiswrites;
  331. }
  332. st_tottimems += t;
  333. st_totwrites += st_thiswrites;
  334. st_totwritten += (outlen+511)/1024;
  335. st_totread += (inlen+511)/1024;
  336. st_totblocks++;
  337. #endif
  338. }
  339. }
  340. CLZWExpander::CLZWExpander(bool _supportbigendian)
  341. {
  342. outbuf = NULL;
  343. outlen = 0;
  344. outmax = 0;
  345. bufalloc = 0;
  346. supportbigendian = _supportbigendian;
  347. }
  348. CLZWExpander::~CLZWExpander()
  349. {
  350. if (bufalloc)
  351. free(outbuf);
  352. }
  353. size32_t CLZWExpander::init(const void *blk)
  354. {
  355. dict.initdict();
  356. BE_MEMCPY4(&outlen,blk);
  357. inbytes=(unsigned char *)blk+sizeof(size32_t);
  358. inbits=inbytes+8;
  359. innext=inbytes+dict.curbits;
  360. curShift=0;
  361. return outlen;
  362. }
  363. void CLZWExpander::expand(void *buf)
  364. {
  365. if (!outlen)
  366. return;
  367. if (buf) {
  368. if (bufalloc)
  369. free(outbuf);
  370. bufalloc = 0;
  371. outbuf = (unsigned char *)buf;
  372. }
  373. else if (outlen>bufalloc) {
  374. if (bufalloc)
  375. free(outbuf);
  376. bufalloc = outlen;
  377. outbuf = (unsigned char *)malloc(bufalloc);
  378. if (!outbuf)
  379. throw MakeStringException(MSGAUD_operator,0, "Out of memory in LZWExpander::expand, requesting %d bytes", bufalloc);
  380. }
  381. unsigned char *out=outbuf;
  382. unsigned char *outend = out+outlen;
  383. int oldcode ;
  384. GETCODE(oldcode);
  385. register int ch=oldcode;
  386. *(out++)=(unsigned char)ch;
  387. while (out!=outend) {
  388. int newcode;
  389. GETCODE(newcode);
  390. unsigned char *sp = stack;
  391. if (newcode >= dict.nextcode) {
  392. *(sp++) = (unsigned char) ch;
  393. ch = oldcode;
  394. }
  395. else if (newcode == BUMP_CODE) {
  396. bool eodict = !dict.bumpbits();
  397. if (eodict)
  398. dict.initdict();
  399. inbytes = innext;
  400. inbits = inbytes+8;
  401. innext += dict.curbits;
  402. curShift=0;
  403. if (eodict) {
  404. GETCODE(oldcode);
  405. ch=oldcode;
  406. *(out++)=(unsigned char)ch;
  407. }
  408. continue;
  409. }
  410. else
  411. ch = newcode;
  412. while (ch > 255) {
  413. *(sp++) = dict.dictchar[ch];
  414. ch = dict.dictparent[ch];
  415. }
  416. #ifdef _DEBUG
  417. assertex(dict.nextcode <= MAX_CODE);
  418. #endif
  419. dict.dictparent[dict.nextcode] = oldcode;
  420. dict.dictchar[dict.nextcode++] = (unsigned char) ch;
  421. oldcode = newcode;
  422. *(out++) = ch;
  423. while ((sp!=stack)&&(out!=outend)) {
  424. *(out++)=(unsigned char)*(--sp);
  425. }
  426. }
  427. }
  428. // encoding
  429. // 0 = 0
  430. // 10 = 1
  431. // 1100 = 2
  432. // 1101 = 3
  433. // 1110bb = 4-7
  434. // 11110bbbb = 8-23
  435. // 111110bbbbbbbb = 24-279
  436. #define OUTBIT(b) { if (b) bb|=bm; if (bm==0x80) { outp[l++] = bb; bb=0; bm=1; } else bm<<=1; }
  437. size32_t bitcompress(unsigned *p,int n,void *outb)
  438. {
  439. int l=0;
  440. unsigned char *outp=(unsigned char *)outb;
  441. outp[1] = 0;
  442. unsigned char bm=1;
  443. unsigned char bb=0;
  444. while (n--) {
  445. unsigned d=*p;
  446. if (d==0) { // special 0
  447. OUTBIT(0);
  448. }
  449. else if (--d==0) { // special 1
  450. OUTBIT(1);
  451. OUTBIT(0);
  452. }
  453. else {
  454. d--;
  455. unsigned m;
  456. unsigned nb=0;
  457. while (1) {
  458. if (nb==5) {
  459. m = 0x80000000;
  460. nb++;
  461. break;
  462. }
  463. unsigned ntb = 1<<nb;
  464. m = 1<<ntb;
  465. nb++;
  466. if (d<m) {
  467. m>>=1;
  468. break;
  469. }
  470. d-=m;
  471. }
  472. OUTBIT(1);
  473. while (nb--)
  474. OUTBIT(1);
  475. OUTBIT(0);
  476. while (m) {
  477. OUTBIT(m&d);
  478. m>>=1;
  479. }
  480. }
  481. p++;
  482. }
  483. if (bm!=1) {
  484. outp[l++] = bb; // flush remaining bits
  485. }
  486. return l;
  487. }
  488. #define MAX_BUCKETS 1024
  489. ICompressor *createLZWCompressor(bool _supportbigendian)
  490. {
  491. return new CLZWCompressor(_supportbigendian);
  492. }
  493. IExpander *createLZWExpander(bool _supportbigendian)
  494. {
  495. return new CLZWExpander(_supportbigendian);
  496. }
  497. //===========================================================================
/*
RLE
uses <d1-df> 1-15 repeats of prev char
<d0> <rept-15> 16-222 repeats of prev char
<d0> as escape (followed by d0-df)
<d0> <ff> (at start) - plain row following
prev char is initially assumed 0
*/
  506. size32_t RLECompress(void *dst,const void *src,size32_t size) // maximum will write is 2+size
  507. {
  508. if (size==0)
  509. return 0;
  510. byte *out=(byte *)dst;
  511. byte *outmax = out+size;
  512. const byte *in=(const byte *)src;
  513. const byte *inmax = in+size;
  514. byte pc = 0;
  515. loop {
  516. byte c = *(in++);
  517. if (c==pc) {
  518. byte cnt = 0;
  519. do {
  520. cnt++;
  521. if (in==inmax) {
  522. if (cnt<=15)
  523. *(out++) = 0xd0+cnt;
  524. else {
  525. *(out++) = 0xd0;
  526. if (out==outmax)
  527. goto Fail;
  528. *(out++) = cnt-15;
  529. }
  530. return (size32_t)(out-(byte *)dst);
  531. }
  532. c = *(in++);
  533. } while ((c==pc)&&(cnt!=222));
  534. if (cnt<=15)
  535. *(out++) = 0xd0+cnt;
  536. else {
  537. *(out++) = 0xd0;
  538. if (out==outmax)
  539. break; // fail
  540. *(out++) = cnt-15;
  541. }
  542. if (out==outmax)
  543. break;
  544. }
  545. if ((c<0xd0)||(c>=0xe0))
  546. *(out++) = c;
  547. else {
  548. *(out++) = 0xd0;
  549. if (out==outmax)
  550. break; // fail
  551. *(out++) = c;
  552. }
  553. if (in==inmax)
  554. return (size32_t)(out-(byte *)dst);
  555. if (out==outmax)
  556. break; // will need at least one more char
  557. pc = c;
  558. }
  559. Fail:
  560. out=(byte *)dst;
  561. *(out++) = 0xd0;
  562. *(out++) = 0xff;
  563. memcpy(out,src,size);
  564. return size+2;
  565. }
  566. size32_t RLEExpand(void *dst,const void *src,size32_t expsize)
  567. {
  568. if (expsize==0)
  569. return 0;
  570. byte *out=(byte *)dst;
  571. byte *outmax = out+expsize;
  572. const byte *in=(const byte *)src;
  573. byte c = *(in++);
  574. if ((c==0xd0)&&(*in==0xff)) {
  575. memcpy(dst,in+1,expsize);
  576. return expsize+2;
  577. }
  578. byte pc = 0;
  579. loop {
  580. if ((c<0xd0)||(c>=0xe0))
  581. *(out++) = c;
  582. else {
  583. c -= 0xd0;
  584. if (c==0) {
  585. c = *(in++);
  586. if (c>=0xd0) {
  587. *(out++) = c;
  588. if (c>=0xe0)
  589. throw MakeStringException(-1,"Corrupt RLE format");
  590. goto Escape;
  591. }
  592. c+=15;
  593. }
  594. size32_t left = (size32_t)(outmax-out);
  595. size32_t cnt = c;
  596. c = pc;
  597. if (left<cnt)
  598. cnt = left;
  599. while (cnt--)
  600. *(out++) = c;
  601. }
  602. Escape:
  603. if (out==outmax)
  604. break;
  605. pc = c;
  606. c = *(in++);
  607. }
  608. return (size32_t)(in-(const byte *)src);
  609. }
  610. void appendToBuffer(MemoryBuffer & out, size32_t len, const void * src)
  611. {
  612. out.append(false);
  613. out.append(len);
  614. out.append(len, src);
  615. }
  616. void compressToBuffer(MemoryBuffer & out, size32_t len, const void * src)
  617. {
  618. unsigned originalLength = out.length();
  619. out.append(true);
  620. out.append((size32_t)0);
  621. if (len >= 32)
  622. {
  623. size32_t newSize = len * 4 / 5; // Copy if compresses less than 80% ...
  624. Owned<ICompressor> compressor = createLZWCompressor();
  625. void *newData = out.reserve(newSize);
  626. compressor->open(newData, newSize);
  627. if (compressor->write(src, len)==len)
  628. {
  629. compressor->close();
  630. size32_t compressedLen = compressor->buflen();
  631. out.setWritePos(originalLength + sizeof(bool));
  632. out.append(compressedLen);
  633. out.setWritePos(originalLength + sizeof(bool) + sizeof(size32_t) + compressedLen);
  634. return;
  635. }
  636. }
  637. // all or don't compress
  638. out.setWritePos(originalLength);
  639. appendToBuffer(out, len, src);
  640. }
  641. void decompressToBuffer(MemoryBuffer & out, const void * src)
  642. {
  643. Owned<IExpander> expander = createLZWExpander();
  644. unsigned outSize = expander->init(src);
  645. void * buff = out.reserve(outSize);
  646. expander->expand(buff);
  647. }
  648. void decompressToBuffer(MemoryBuffer & out, MemoryBuffer & in)
  649. {
  650. bool compressed;
  651. size32_t srcLen;
  652. in.read(compressed).read(srcLen);
  653. if (compressed)
  654. decompressToBuffer(out, in.readDirect(srcLen));
  655. else
  656. out.append(srcLen, in.readDirect(srcLen));
  657. }
  658. void decompressToAttr(MemoryAttr & out, const void * src)
  659. {
  660. Owned<IExpander> expander = createLZWExpander();
  661. unsigned outSize = expander->init(src);
  662. void * buff = out.allocate(outSize);
  663. expander->expand(buff);
  664. }
  665. void decompressToBuffer(MemoryAttr & out, MemoryBuffer & in)
  666. {
  667. bool compressed;
  668. size32_t srcLen;
  669. in.read(compressed).read(srcLen);
  670. if (compressed)
  671. decompressToAttr(out, in.readDirect(srcLen));
  672. else
  673. out.set(srcLen, in.readDirect(srcLen));
  674. }
  675. /*
  676. Simple Diff compression format is
  677. <compressed-block> ::= <initial-row> { <compressed-row> }
  678. <compressed-row> ::= { <same-count> <diff-count> <diff-bytes> }
  679. <same-count> ::= { 255 } <byte> -- value is sum
  680. <diff-count> ::= <byte>
  681. <diff-bytes> ::= { <bytes> }
  682. // note if diff-count is > 255 it will be broken into 255 diff followed by 0 same
  683. // also need at least 2 bytes same before stops difference block
  684. thus AAAAAA...AAAAAA [ len 500 ]
  685. followed by ADADAD...ADADAD
  686. will be saved as 1,255,ADADA..ADADA,0,244,ADADA..ADADA -> 503 bytes
  687. and AAAAAA...AAAAAA [ len 500 ]
  688. followed by AADDAA...AADDAA
  689. will be saved as 2,2,DD,2,2,DD...2,2,DD,2 -> 499 bytes
  690. and AAAAAA...AAAAAA [ len 500 ]
  691. followed by AAAAAA...AAAAAA
  692. will be saved as 255,245 -> 2 bytes
  693. and AAAAAA...AAAAAA [ len 500 ]
  694. followed by ZZZZZZ...ZZZZZZ
will be saved as 0,255,ZZ..ZZ,0,245,ZZ..ZZ -> 504 bytes
// maximum size of a row is bounded by: rowsize+((rowsize+254)/255)*2;
  697. */
// Encodes the rs-byte row at src as differences from the previous row held in
// buff, writing alternating <same-count> / <diff-count> <diff-bytes> groups to
// dst. Bytes that differ are also copied into buff, so on return buff holds the
// new row. Returns the number of bytes written to dst.
// rs must be non-zero (only checked in debug builds).
size32_t DiffCompress(const void *src,void *dst,void *buff,size32_t rs)
{
    const unsigned char *s=(const unsigned char *)src;
    unsigned char *d=(unsigned char *)dst;
    unsigned char *b=(unsigned char *)buff;
    ASSERT(rs);
    size32_t cnt;
    cnt = 0;
    // Count bytes that match the previous row. The diff loop below jumps back
    // in at Loop once it has found the start of another matching stretch, so
    // this section and everything after it act as the body of an outer loop.
    while (*s==*b) {
Loop:
        cnt++;
        rs--;
        if (rs==0) break;
        s++;
        b++;
    }
    // same-count is written as a run of 255s followed by a final byte below 255
    while (cnt>=255) {
        *d = 255;
        d++;
        cnt-=255;
    }
    *d = (unsigned char)cnt;
    d++;
    if (rs!=0) {
        unsigned char *dcnt=d;  // diff-count slot, filled in once the length is known
        d++;
        cnt = 0;
        while(1) {
            cnt++;
            *d = *s;
            d++;
            *b = *s;            // keep the previous-row buffer up to date
            rs--;
            if (rs==0) {
                *dcnt=(unsigned char)cnt;
                break;
            }
            s++;
            b++;
            if (*s==*b) {
                // only end the diff block when at least two bytes match
                if ((rs>1)&&(s[1]==b[1])) { // slower but slightly better compression
                    *dcnt=(unsigned char)cnt;
                    cnt = 0;
                    goto Loop;
                }
            }
            if (cnt==255) {
                // diff-count is a single byte: close this block, write a zero
                // same-count and open a new diff block
                *dcnt=(unsigned char)cnt;
                *d = 0;
                d++;
                dcnt = d++;
                cnt = 0;
            }
        }
    }
    return (size32_t)(d-(unsigned char *)dst);
}
// Same encoding as DiffCompress, but the previous row is read-only: the rs-byte
// row at src is encoded as differences from prev into dst, and prev is left
// unchanged. Returns the number of bytes written to dst.
// rs must be non-zero (only checked in debug builds).
size32_t DiffCompress2(const void *src,void *dst,const void *prev,size32_t rs)
{
    // doesn't update prev
    const unsigned char *s=(const unsigned char *)src;
    unsigned char *d=(unsigned char *)dst;
    const unsigned char *b=(unsigned char *)prev;
    ASSERT(rs);
    size32_t cnt;
    cnt = 0;
    // Count bytes that match the previous row. The diff loop below jumps back
    // in at Loop once it has found the start of another matching stretch, so
    // this section and everything after it act as the body of an outer loop.
    while (*s==*b) {
Loop:
        cnt++;
        rs--;
        if (rs==0) break;
        s++;
        b++;
    }
    // same-count is written as a run of 255s followed by a final byte below 255
    while (cnt>=255) {
        *d = 255;
        d++;
        cnt-=255;
    }
    *d = (unsigned char)cnt;
    d++;
    if (rs!=0) {
        unsigned char *dcnt=d;  // diff-count slot, filled in once the length is known
        d++;
        cnt = 0;
        while(1) {
            cnt++;
            *d = *s;
            d++;
            rs--;
            if (rs==0) {
                *dcnt=(unsigned char)cnt;
                break;
            }
            s++;
            b++;
            if (*s==*b) {
                // only end the diff block when at least two bytes match
                if ((rs>1)&&(s[1]==b[1])) { // slower but slightly better compression
                    *dcnt=(unsigned char)cnt;
                    cnt = 0;
                    goto Loop;
                }
            }
            if (cnt==255) {
                // diff-count is a single byte: close this block, write a zero
                // same-count and open a new diff block
                *dcnt=(unsigned char)cnt;
                *d = 0;
                d++;
                dcnt = d++;
                cnt = 0;
            }
        }
    }
    return (size32_t)(d-(unsigned char *)dst);
}
  812. size32_t DiffCompressFirst(const void *src,void *dst,void *buf,size32_t rs)
  813. {
  814. memcpy(buf,src,rs);
  815. const unsigned char *s=(const unsigned char *)src;
  816. unsigned char *d=(unsigned char *)dst;
  817. *d = 0;
  818. d++;
  819. while (rs) {
  820. unsigned cnt=(rs<=255)?rs:255;
  821. *d=(unsigned char)cnt;
  822. d++;
  823. memcpy(d,s,cnt);
  824. d += cnt;
  825. s += cnt;
  826. *d = 0;
  827. d++;
  828. rs -= cnt;
  829. }
  830. return (size32_t)(d-(unsigned char *)dst);
  831. }
  832. size32_t DiffCompressedSize(const void *src,size32_t rs)
  833. {
  834. const unsigned char *s=(const unsigned char *)src;
  835. unsigned n;
  836. while (rs) {
  837. // first comes compressed
  838. do {
  839. n = *s;
  840. s++;
  841. rs -= n;
  842. } while (n==255);
  843. if (rs==0)
  844. break;
  845. n = *s;
  846. s++;
  847. rs -= n;
  848. s += n;
  849. }
  850. return (size32_t)(s-(const unsigned char *)src);
  851. }
  852. size32_t DiffExpand(const void *src,void *dst,const void *prev,size32_t rs)
  853. {
  854. unsigned char *s=(unsigned char *)src;
  855. unsigned char *d=(unsigned char *)dst;
  856. const unsigned char *b=(const unsigned char *)prev;
  857. ASSERT(rs);
  858. while (rs) {
  859. size32_t cnt = 0;
  860. size32_t c;
  861. do {
  862. c=(size32_t)*s;
  863. s++;
  864. cnt += c;
  865. } while (c==255);
  866. rs -= cnt;
  867. while (cnt!=0) {
  868. *d = *b;
  869. d++;
  870. b++;
  871. cnt--;
  872. }
  873. if ((int)rs<=0) {
  874. if (rs == 0)
  875. break;
  876. throw MakeStringException(-1,"Corrupt compressed data(1)");
  877. }
  878. cnt=(size32_t)*s;
  879. s++;
  880. rs -= cnt;
  881. b += cnt;
  882. const unsigned char *e = s+cnt;
  883. while (s!=e) {
  884. *d = *s;
  885. s++;
  886. d++;
  887. }
  888. }
  889. return (size32_t)(s-(unsigned char *)src);
  890. }
  891. // helper class
  892. class CDiffExpand
  893. {
  894. byte *s;
  895. const byte *b;
  896. size32_t rs;
  897. enum {
  898. S_pre_repeat,
  899. S_repeat,
  900. S_diff
  901. } state;
  902. size32_t cnt;
  903. public:
  904. inline void init(const void *src,const void *prev,size32_t _rs)
  905. {
  906. s=(byte *)src;
  907. b=(const byte *)prev;
  908. state = S_pre_repeat;
  909. rs = _rs;
  910. cnt = 0;
  911. }
  912. inline void skip(size32_t sz)
  913. {
  914. if (!sz)
  915. return;
  916. while (sz) {
  917. switch (state) {
  918. case S_pre_repeat:
  919. if (!rs)
  920. return;
  921. cnt = 0;
  922. size32_t c;
  923. do {
  924. c=(size32_t)*s;
  925. s++;
  926. cnt += c;
  927. } while (c==255);
  928. rs -= cnt;
  929. state = S_repeat;
  930. // fall through
  931. case S_repeat:
  932. if (cnt) {
  933. if (sz<=cnt) {
  934. cnt -= sz;
  935. b += sz;
  936. return;
  937. }
  938. b += cnt;
  939. sz-=cnt;
  940. }
  941. if ((int)rs<=0) {
  942. if (rs == 0)
  943. return;
  944. throw MakeStringException(-1,"Corrupt compressed data(2)");
  945. }
  946. cnt=(size32_t)*s;
  947. s++;
  948. rs -= cnt;
  949. b += cnt;
  950. state = S_diff;
  951. // fall through
  952. case S_diff:
  953. if (cnt) {
  954. if (sz<=cnt) {
  955. cnt -= sz;
  956. s += sz;
  957. return;
  958. }
  959. s += cnt;
  960. sz -= cnt;
  961. }
  962. state = S_pre_repeat;
  963. }
  964. }
  965. }
  966. inline size32_t cpy(void *dst,size32_t sz)
  967. {
  968. if (!sz)
  969. return 0;
  970. byte *d=(byte *)dst;
  971. loop {
  972. switch (state) {
  973. case S_pre_repeat:
  974. if (!rs)
  975. return (size32_t)(d-(byte *)dst);
  976. cnt = 0;
  977. size32_t c;
  978. do {
  979. c=(size32_t)*s;
  980. s++;
  981. cnt += c;
  982. } while (c==255);
  983. rs -= cnt;
  984. state = S_repeat;
  985. // fall through
  986. case S_repeat:
  987. if (cnt) {
  988. if (cnt>=sz) {
  989. memcpy(d,b,sz);
  990. b += sz;
  991. cnt -= sz;
  992. d += sz;
  993. return (size32_t)(d-(byte *)dst);
  994. }
  995. memcpy(d,b,cnt);
  996. b += cnt;
  997. d += cnt;
  998. sz -= cnt;
  999. }
  1000. if ((int)rs<=0) {
  1001. if (rs == 0)
  1002. return (size32_t)(d-(byte *)dst);
  1003. throw MakeStringException(-1,"Corrupt compressed data(3)");
  1004. }
  1005. cnt=(size32_t)*s;
  1006. s++;
  1007. rs -= cnt;
  1008. b += cnt;
  1009. state = S_diff;
  1010. // fall through
  1011. case S_diff:
  1012. if (cnt) {
  1013. if (cnt>=sz) {
  1014. memcpy(d,s,sz);
  1015. s += sz;
  1016. cnt -= sz;
  1017. d += sz;
  1018. return (size32_t)(d-(byte *)dst);
  1019. }
  1020. memcpy(d,s,cnt);
  1021. s += cnt;
  1022. d += cnt;
  1023. sz -= cnt;
  1024. }
  1025. state = S_pre_repeat;
  1026. }
  1027. }
  1028. return 0; // never gets here
  1029. }
  1030. inline int cmp(const void *dst,size32_t sz)
  1031. {
  1032. int ret;
  1033. if (!sz)
  1034. return rs?-1:0;
  1035. const byte *d=(const byte *)dst;
  1036. loop {
  1037. switch (state) {
  1038. case S_pre_repeat:
  1039. if (!rs)
  1040. return sz?1:0;
  1041. cnt = 0;
  1042. size32_t c;
  1043. do {
  1044. c=(size32_t)*s;
  1045. s++;
  1046. cnt += c;
  1047. } while (c==255);
  1048. rs -= cnt;
  1049. state = S_repeat;
  1050. // fall through
  1051. case S_repeat:
  1052. if (cnt) {
  1053. if (cnt>=sz) {
  1054. ret = memcmp(d,b,sz);
  1055. b += sz;
  1056. cnt -= sz;
  1057. return ret;
  1058. }
  1059. ret = memcmp(d,b,cnt);
  1060. b += cnt;
  1061. if (ret)
  1062. return ret;
  1063. d += cnt;
  1064. sz -= cnt;
  1065. }
  1066. if ((int)rs<=0) {
  1067. if (rs == 0)
  1068. return sz?1:0;
  1069. throw MakeStringException(-1,"Corrupt compressed data(4)");
  1070. }
  1071. cnt=(size32_t)*s;
  1072. s++;
  1073. rs -= cnt;
  1074. b += cnt;
  1075. state = S_diff;
  1076. // fall through
  1077. case S_diff:
  1078. if (cnt) {
  1079. if (cnt>=sz) {
  1080. ret = memcmp(d,s,sz);
  1081. s += sz;
  1082. cnt -= sz;
  1083. return ret;
  1084. }
  1085. ret = memcmp(d,s,cnt);
  1086. s += cnt;
  1087. if (ret)
  1088. return ret;
  1089. d += cnt;
  1090. sz -= cnt;
  1091. }
  1092. state = S_pre_repeat;
  1093. }
  1094. }
  1095. return 0; // never gets here
  1096. }
  1097. };
  1098. // =============================================================================
  1099. // RDIFF
  1100. // format ::= <expsize> <recsize> <plainrec> { <rowdif> }
  1101. class jlib_decl CRDiffCompressor : public CInterface, public ICompressor
  1102. {
  1103. size32_t inlen;
  1104. size32_t outlen;
  1105. size32_t bufalloc;
  1106. size32_t remaining;
  1107. void *outbuf;
  1108. unsigned char *out;
  1109. size32_t recsize; // assumed fixed length rows
  1110. // assumes a transaction is a record
  1111. MemoryBuffer transbuf;
  1112. size32_t maxrecsize; // maximum size diff compress
  1113. unsigned char *prev;
  1114. public:
  1115. IMPLEMENT_IINTERFACE;
  1116. CRDiffCompressor()
  1117. {
  1118. outbuf = NULL;
  1119. outlen = 0;
  1120. maxrecsize = 0;
  1121. recsize = 0;
  1122. bufalloc = 0;
  1123. prev = NULL;
  1124. }
  1125. ~CRDiffCompressor()
  1126. {
  1127. free(prev);
  1128. if (bufalloc)
  1129. free(outbuf);
  1130. }
  1131. void open(void *buf,size32_t max)
  1132. {
  1133. if (buf) {
  1134. if (bufalloc) {
  1135. free(outbuf);
  1136. }
  1137. bufalloc = 0;
  1138. outbuf = buf;
  1139. }
  1140. else if (max>bufalloc) {
  1141. if (bufalloc)
  1142. free(outbuf);
  1143. bufalloc = max;
  1144. outbuf = malloc(bufalloc);
  1145. }
  1146. ASSERT(max>2+sizeof(size32_t)*2); // minimum required (actually will need enough for recsize so only a guess)
  1147. inlen = 0;
  1148. memset(outbuf,0,sizeof(size32_t)*2);
  1149. outlen = sizeof(size32_t)*2;
  1150. remaining = max-outlen;
  1151. out = (unsigned char *)outbuf+sizeof(size32_t)*2;
  1152. free(prev);
  1153. prev = NULL;
  1154. }
  1155. void close()
  1156. {
  1157. transbuf.clear();
  1158. memcpy(outbuf,&inlen,sizeof(inlen)); // expanded size
  1159. memcpy((unsigned char *)outbuf+sizeof(inlen),&recsize,sizeof(recsize));
  1160. }
  1161. inline size32_t maxcompsize(size32_t s) { return s+((s+254)/255)*2; }
  1162. size32_t write(const void *buf,size32_t buflen)
  1163. {
  1164. // assumes a transaction is a row and at least one row fits in
  1165. if (prev) {
  1166. if ((transbuf.length()==0)&&(remaining<maxrecsize)) // this is a bit odd because no incremental diffcomp
  1167. return 0;
  1168. transbuf.append(buflen,buf);
  1169. }
  1170. else { // first row
  1171. if (buflen>remaining)
  1172. throw MakeStringException(-3,"CRDiffCompressor row doesn't fit in buffer!");
  1173. memcpy(out,buf,buflen);
  1174. out += buflen;
  1175. outlen += buflen;
  1176. }
  1177. // should inlen be updated here (probably not in transaction mode which is all this supports)
  1178. return buflen;
  1179. }
  1180. void startblock()
  1181. {
  1182. transbuf.clear();
  1183. }
  1184. void commitblock()
  1185. {
  1186. if (prev) {
  1187. if (recsize!=transbuf.length())
  1188. throw MakeStringException(-1,"CRDiffCompressor used with variable sized row");
  1189. size32_t sz = DiffCompress(transbuf.toByteArray(),out,prev,recsize);
  1190. transbuf.clear();
  1191. out += sz;
  1192. outlen += sz;
  1193. remaining -= sz;
  1194. }
  1195. else {
  1196. recsize = outlen-sizeof(size32_t)*2;
  1197. maxrecsize = maxcompsize(recsize);
  1198. prev = (unsigned char *)malloc(recsize);
  1199. memcpy(prev,out-recsize,recsize);
  1200. remaining -= recsize;
  1201. }
  1202. inlen += recsize;
  1203. }
  1204. virtual void *bufptr() { return outbuf;}
  1205. virtual size32_t buflen() { return outlen;}
  1206. };
  1207. class jlib_decl CRDiffExpander : public CInterface, public IExpander
  1208. {
  1209. unsigned char *outbuf;
  1210. size32_t outlen;
  1211. size32_t bufalloc;
  1212. unsigned char *in;
  1213. size32_t recsize;
  1214. public:
  1215. IMPLEMENT_IINTERFACE;
  1216. CRDiffExpander()
  1217. {
  1218. outbuf = NULL;
  1219. outlen = 0;
  1220. bufalloc = 0;
  1221. recsize = 0;
  1222. }
  1223. ~CRDiffExpander()
  1224. {
  1225. if (bufalloc)
  1226. free(outbuf);
  1227. }
  1228. size32_t init(const void *blk) // returns size required
  1229. {
  1230. memcpy(&outlen,blk,sizeof(outlen));
  1231. memcpy(&recsize,(unsigned char *)blk+sizeof(outlen),sizeof(recsize));
  1232. in=(unsigned char *)blk+sizeof(outlen)+sizeof(recsize);
  1233. return outlen;
  1234. }
  1235. void expand(void *buf)
  1236. {
  1237. if (!outlen)
  1238. return;
  1239. if (buf) {
  1240. if (bufalloc)
  1241. free(outbuf);
  1242. bufalloc = 0;
  1243. outbuf = (unsigned char *)buf;
  1244. }
  1245. else if (outlen>bufalloc) {
  1246. if (bufalloc)
  1247. free(outbuf);
  1248. bufalloc = outlen;
  1249. outbuf = (unsigned char *)malloc(bufalloc);
  1250. }
  1251. if (outlen<recsize)
  1252. throw MakeStringException(-1,"CRDiffExpander: invalid buffer format");
  1253. unsigned char *out=outbuf;
  1254. memcpy(out,in,recsize);
  1255. const unsigned char *prev = out;
  1256. out += recsize;
  1257. in += recsize;
  1258. size_t remaining = outlen-recsize;
  1259. while (remaining) {
  1260. if (remaining<recsize)
  1261. throw MakeStringException(-2,"CRDiffExpander: invalid buffer format");
  1262. size32_t sz = DiffExpand(in,out,prev,recsize);
  1263. in += sz;
  1264. prev = out;
  1265. out += recsize;
  1266. remaining -= recsize;
  1267. }
  1268. }
  1269. virtual void *bufptr() { return outbuf;}
  1270. virtual size32_t buflen() { return outlen;}
  1271. };
  1272. ICompressor *createRDiffCompressor()
  1273. {
  1274. return new CRDiffCompressor;
  1275. }
  1276. IExpander *createRDiffExpander()
  1277. {
  1278. return new CRDiffExpander;
  1279. }
  // =============================================================================
  // RANDRDIFF
  // format ::= <totsize> <0xffff> <recsize> <firstrlesize> <numrows> { <rowofs> } <difrecs> <firstrecrle>
  // all 16bit except recs

  // Header placed at the start of a RANDRDIFF block: five fixed 16-bit fields
  // followed by one 16-bit offset per row.
  struct RRDheader
  {
      unsigned short totsize;
      unsigned short flag;
      unsigned short recsize;
      unsigned short firstrlesize;
      unsigned short numrows;
      unsigned short rowofs[0x3fff];

      // size in bytes of the part of the header that is in use
      inline unsigned short hsize()
      {
          const unsigned entries = 5+numrows;     // fixed fields + one offset per row
          return (unsigned short)(entries*sizeof(short));
      }
  };

  // size of the header with no row offsets at all
  #define MIN_RRDHEADER_SIZE (5*sizeof(short))
  1295. class jlib_decl CRandRDiffCompressor : public CInterface, public ICompressor
  1296. {
  1297. size32_t inlen;
  1298. size32_t bufalloc;
  1299. size32_t max;
  1300. void *outbuf;
  1301. RRDheader *header;
  1302. // assumes a transaction is a record
  1303. MemoryBuffer rowbuf;
  1304. MemoryBuffer diffbuf;
  1305. MemoryBuffer firstrec;
  1306. MemoryAttr firstrle;
  1307. size32_t maxdiffsize;
  1308. size32_t recsize;
  1309. size32_t compsize;
  1310. public:
  1311. IMPLEMENT_IINTERFACE;
  1312. CRandRDiffCompressor()
  1313. {
  1314. outbuf = NULL;
  1315. header = NULL;
  1316. bufalloc = 0;
  1317. max = 0;
  1318. maxdiffsize = 0;
  1319. recsize = 0;
  1320. }
  1321. ~CRandRDiffCompressor()
  1322. {
  1323. if (bufalloc)
  1324. free(outbuf);
  1325. }
  1326. void open(void *buf,size32_t _max)
  1327. {
  1328. max = _max;
  1329. if (buf) {
  1330. if (bufalloc) {
  1331. free(outbuf);
  1332. }
  1333. bufalloc = 0;
  1334. outbuf = buf;
  1335. }
  1336. else if (max>bufalloc) {
  1337. if (bufalloc)
  1338. free(outbuf);
  1339. bufalloc = max;
  1340. outbuf = malloc(bufalloc);
  1341. }
  1342. ASSERT(max>MIN_RRDHEADER_SIZE+sizeof(unsigned short)+3); // hopefully a lot bigger!
  1343. header = (RRDheader *)outbuf;
  1344. inlen = 0;
  1345. memset(header,0,MIN_RRDHEADER_SIZE);
  1346. diffbuf.clear();
  1347. firstrec.clear();
  1348. firstrle.clear();
  1349. rowbuf.clear();
  1350. }
  1351. void close()
  1352. {
  1353. header->rowofs[0] = (unsigned short)diffbuf.length();
  1354. ASSERT((size32_t)(header->totsize+header->firstrlesize)<=max);
  1355. unsigned short hofs = header->hsize();
  1356. ASSERT(header->totsize==hofs+diffbuf.length());
  1357. byte *out = (byte *)outbuf+hofs;
  1358. memcpy(out,diffbuf.toByteArray(),diffbuf.length());
  1359. out += diffbuf.length();
  1360. diffbuf.clear();
  1361. memcpy(out,firstrle.bufferBase(),header->firstrlesize);
  1362. header->totsize += header->firstrlesize;
  1363. firstrle.clear();
  1364. firstrec.clear();
  1365. header->flag = 0xffff;
  1366. // adjust offsets
  1367. unsigned i = header->numrows;
  1368. while (i--)
  1369. header->rowofs[i] += hofs;
  1370. }
  1371. inline size32_t maxcompsize(size32_t s) { return s+((s+254)/255)*2; }
  1372. size32_t write(const void *buf,size32_t buflen)
  1373. {
  1374. // assumes a transaction is a row and at least one row fits in
  1375. unsigned nr = header->numrows;
  1376. if (nr) {
  1377. rowbuf.append(buflen,buf);
  1378. if (rowbuf.length()==recsize) { // because no incremental diffcomp do here
  1379. size32_t sz = diffbuf.length();
  1380. compsize = DiffCompress2(rowbuf.toByteArray(),diffbuf.reserve(maxdiffsize),firstrec.toByteArray(),recsize);
  1381. if (header->totsize+sizeof(short)+compsize+header->firstrlesize>max) {
  1382. diffbuf.setLength(sz);
  1383. return 0;
  1384. }
  1385. header->rowofs[nr] = (unsigned short)sz; // will need to adjust later
  1386. diffbuf.setLength(sz+compsize);
  1387. }
  1388. }
  1389. else
  1390. firstrec.append(buflen,buf);
  1391. return buflen;
  1392. }
  1393. void startblock()
  1394. {
  1395. rowbuf.clear();
  1396. }
  1397. void commitblock()
  1398. {
  1399. unsigned nr = header->numrows;
  1400. if (nr) {
  1401. if (recsize!=rowbuf.length())
  1402. throw MakeStringException(-1,"CRandDiffCompressor used with variable sized row");
  1403. rowbuf.clear();
  1404. header->numrows++;
  1405. header->totsize += (unsigned short)compsize+sizeof(unsigned short);
  1406. }
  1407. else {
  1408. header->numrows = 1;
  1409. header->totsize = header->hsize(); // don't add in rle size yet
  1410. recsize = firstrec.length();
  1411. header->recsize = (unsigned short)recsize;
  1412. maxdiffsize = maxcompsize(recsize);
  1413. size32_t sz = RLECompress(firstrle.allocate(recsize+2),firstrec.toByteArray(),recsize);
  1414. header->firstrlesize = (unsigned short)sz;
  1415. }
  1416. inlen += recsize;
  1417. }
  1418. void *bufptr() { return outbuf;}
  1419. size32_t buflen() { return header->totsize;}
  1420. };
  1421. class jlib_decl CRandRDiffExpander : public CInterface, public IRandRowExpander
  1422. {
  1423. MemoryAttr buf;
  1424. const RRDheader *header;
  1425. size32_t recsize;
  1426. unsigned numrows;
  1427. byte *firstrow;
  1428. inline byte *rowptr(unsigned idx) const { return (byte *)header+header->rowofs[idx]; }
  1429. public:
  1430. IMPLEMENT_IINTERFACE;
  1431. CRandRDiffExpander()
  1432. {
  1433. recsize = 0;
  1434. numrows = 0;
  1435. header = NULL;
  1436. }
  1437. ~CRandRDiffExpander()
  1438. {
  1439. }
  1440. bool init(const void *blk,bool copy)
  1441. {
  1442. // if copy then use new block with first row at end
  1443. header=(const RRDheader *)blk;
  1444. if (header->flag!=0xffff) // flag
  1445. return false;
  1446. recsize = header->recsize;
  1447. numrows = header->numrows;
  1448. RRDheader *headercopy;
  1449. if (copy) {
  1450. size32_t sz = header->totsize-header->firstrlesize+recsize;
  1451. headercopy = (RRDheader *)buf.allocate(sz);
  1452. memcpy(headercopy,blk,header->totsize-header->firstrlesize);
  1453. firstrow = (byte *)headercopy+headercopy->rowofs[0];
  1454. headercopy->totsize = (unsigned short)sz;
  1455. }
  1456. else
  1457. firstrow = (byte *)buf.allocate(recsize);
  1458. RLEExpand(firstrow,(const byte *)header+header->rowofs[0],recsize);
  1459. if (copy)
  1460. header = headercopy;
  1461. return true;
  1462. }
  1463. bool expandRow(void *target,unsigned idx) const
  1464. {
  1465. if (idx>=numrows)
  1466. return false;
  1467. if (idx)
  1468. DiffExpand(rowptr(idx),target,firstrow,recsize);
  1469. else
  1470. memcpy(target, firstrow, recsize);
  1471. return true;
  1472. }
  1473. size32_t expandRow(void *target,unsigned idx,size32_t ofs,size32_t sz) const
  1474. {
  1475. if ((idx>=numrows)||(ofs>=recsize))
  1476. return 0;
  1477. if (sz>recsize-ofs)
  1478. sz = recsize-ofs;
  1479. if (idx==0)
  1480. memcpy(target,firstrow+ofs,sz);
  1481. else if ((ofs==0)&&(sz>=recsize))
  1482. DiffExpand(rowptr(idx),target,firstrow,recsize);
  1483. else {
  1484. CDiffExpand exp;
  1485. exp.init(rowptr(idx),firstrow,recsize);
  1486. exp.skip(ofs);
  1487. exp.cpy(target,sz);
  1488. }
  1489. return sz;
  1490. }
  1491. int cmpRow(const void *target,unsigned idx,size32_t ofs=0,size32_t sz=(size32_t)-1) const
  1492. {
  1493. if ((idx>=numrows)||(ofs>=recsize))
  1494. return -1;
  1495. if (sz>=recsize-ofs)
  1496. sz = recsize-ofs;
  1497. if (idx==0)
  1498. return memcmp(target,firstrow+ofs,sz);
  1499. CDiffExpand exp;
  1500. exp.init(rowptr(idx),firstrow,recsize);
  1501. exp.skip(ofs);
  1502. return exp.cmp(target,sz);
  1503. }
  1504. size32_t rowSize() const { return recsize; }
  1505. unsigned numRows() const { return numrows; }
  1506. const byte *firstRow() const { return firstrow; }
  1507. };
  1508. ICompressor *createRandRDiffCompressor()
  1509. {
  1510. return new CRandRDiffCompressor;
  1511. }
  1512. IRandRowExpander *createRandRDiffExpander()
  1513. {
  1514. return new CRandRDiffExpander;
  1515. }
  // =============================================================================
  // Compressed files

  // how a compressed file is being accessed
  enum ICFmode
  {
      ICFcreate,
      ICFread,
      ICFappend
  };

  // values stored in the trailer's compressedType field
  #define COMPRESSEDFILEFLAG (I64C(0xc0528ce99f10da55))
  #define FASTCOMPRESSEDFILEFLAG (I64C(0xc1518de99f10da55))

  #define COMPRESSEDFILEBLOCKSIZE (0x10000)
  1522. #pragma pack(push,1)
  1523. struct CompressedFileTrailer
  1524. {
  1525. unsigned datacrc;
  1526. offset_t expandedSize;
  1527. offset_t indexPos; // end of blocks
  1528. size32_t blockSize;
  1529. size32_t recordSize; // 0 is lzw compressed
  1530. __int64 compressedType;
  1531. unsigned crc; // must be last
  1532. unsigned numBlocks() { return (unsigned)((indexPos+blockSize-1)/blockSize); }
  1533. unsigned method()
  1534. {
  1535. if (recordSize)
  1536. return COMPRESS_METHOD_ROWDIF;
  1537. if (compressedType==COMPRESSEDFILEFLAG)
  1538. return COMPRESS_METHOD_LZW;
  1539. if (compressedType==FASTCOMPRESSEDFILEFLAG)
  1540. return COMPRESS_METHOD_FASTLZ;
  1541. return 0;
  1542. }
  1543. void setDetails(IPropertyTree &tree)
  1544. {
  1545. tree.setPropInt("@datacrc",datacrc);
  1546. tree.setPropInt64("@expandedSize",expandedSize);
  1547. tree.setPropInt64("@indexPos",indexPos);
  1548. tree.setPropInt("@blockSize",blockSize);
  1549. tree.setPropInt("@recordSize",recordSize); // 0 is lzw compressed
  1550. tree.setPropInt64("@compressedType",compressedType);
  1551. tree.setPropInt("@method",method());
  1552. tree.setPropInt("@crc",crc);
  1553. tree.setPropInt("@numblocks",(unsigned)((indexPos+blockSize-1)/blockSize));
  1554. }
  1555. };
  1556. // backward compatibility - will remove at some point
  1557. struct WinCompressedFileTrailer
  1558. {
  1559. unsigned datacrc;
  1560. unsigned filler1;
  1561. offset_t expandedSize;
  1562. offset_t indexPos; // end of blocks
  1563. size32_t blockSize;
  1564. size32_t recordSize; // 0 is lzw compressed
  1565. __int64 compressedType;
  1566. unsigned crc; // must be last
  1567. unsigned filler2;
  1568. void translate(CompressedFileTrailer &out)
  1569. {
  1570. if (compressedType==COMPRESSEDFILEFLAG) {
  1571. out.datacrc = datacrc;
  1572. out.expandedSize = expandedSize;
  1573. out.indexPos = indexPos;
  1574. out.blockSize = blockSize;
  1575. out.recordSize = recordSize;
  1576. out.compressedType = compressedType;
  1577. out.crc = crc;
  1578. }
  1579. else {
  1580. memcpy(&out,(byte *)this+sizeof(WinCompressedFileTrailer)-sizeof(CompressedFileTrailer),sizeof(CompressedFileTrailer));
  1581. }
  1582. }
  1583. };
  1584. #pragma pack(pop)
  1585. class CCompressedFile : public CInterface, implements ICompressedFileIO
  1586. {
  1587. Linked<IFileIO> fileio;
  1588. Linked<IMemoryMappedFile> mmfile;
  1589. CompressedFileTrailer trailer;
  1590. unsigned curblocknum;
  1591. offset_t curblockpos; // logical pos (reading only)
  1592. MemoryBuffer curblockbuf; // expanded buffer when reading
  1593. MemoryAttr compblk;
  1594. byte *compblkptr;
  1595. size32_t compblklen;
  1596. MemoryAttr compbuf;
  1597. MemoryBuffer indexbuf; // non-empty once index read
  1598. ICFmode mode;
  1599. CriticalSection crit;
  1600. MemoryBuffer overflow; // where partial row written
  1601. MemoryAttr prevrowbuf;
  1602. bool checkcrc;
  1603. bool setcrc;
  1604. Owned<ICompressor> compressor;
  1605. Owned<IExpander> expander;
  1606. unsigned indexNum() { return indexbuf.length()/sizeof(offset_t); }
  1607. unsigned lookupIndex(offset_t pos,offset_t &curpos,size32_t &expsize)
  1608. {
  1609. // NB index starts at block 1 (and has size as last entry)
  1610. const offset_t *index;
  1611. unsigned a = 0;
  1612. unsigned b = indexNum();
  1613. index = (const offset_t *)indexbuf.toByteArray();
  1614. while (b>a) {
  1615. unsigned m = (a+b)/2;
  1616. __int64 dif = (__int64)pos-index[m];
  1617. if (dif==0) {
  1618. b = m+1;
  1619. a = b;
  1620. }
  1621. else if (dif>0)
  1622. a = m+1;
  1623. else
  1624. b = m;
  1625. }
  1626. curpos=b?index[b-1]:0;
  1627. expsize = (size32_t)(index[b]-curpos);
  1628. return b;
  1629. }
  1630. void getblock(offset_t pos)
  1631. {
  1632. curblockbuf.clear();
  1633. size32_t expsize;
  1634. curblocknum = lookupIndex(pos,curblockpos,expsize);
  1635. size32_t toread = trailer.blockSize;
  1636. offset_t p = (offset_t)curblocknum*toread;
  1637. assertex(p<=trailer.indexPos);
  1638. if (trailer.indexPos-p<(offset_t)toread)
  1639. toread = (size32_t)(trailer.indexPos-p);
  1640. if (!toread)
  1641. return;
  1642. if (fileio) {
  1643. MemoryAttr comp;
  1644. void *b=comp.allocate(toread);
  1645. size32_t r = fileio->read(p,toread,b);
  1646. assertex(r==toread);
  1647. expand(b,curblockbuf,expsize);
  1648. }
  1649. else { // memory mapped
  1650. assertex((memsize_t)p==p);
  1651. expand(mmfile->base()+(memsize_t)p,curblockbuf,expsize);
  1652. }
  1653. }
  1654. void checkedwrite(offset_t pos, size32_t len, const void * data)
  1655. {
  1656. size32_t ret = fileio->write(pos,len,data);
  1657. if (ret!=len)
  1658. throw MakeStringException(DISK_FULL_EXCEPTION_CODE,"CCompressedFile::checkedwrite");
  1659. if (setcrc)
  1660. trailer.crc = crc32((const char *)data,len,trailer.crc);
  1661. }
  1662. void flush()
  1663. {
  1664. curblocknum++;
  1665. indexbuf.append((unsigned __int64) trailer.expandedSize-overflow.length());
  1666. offset_t p = ((offset_t)curblocknum)*((offset_t)trailer.blockSize);
  1667. if (trailer.recordSize==0) {
  1668. compressor->close();
  1669. compblklen = compressor->buflen();
  1670. }
  1671. if (compblklen) {
  1672. if (p>trailer.indexPos) { // fill gap
  1673. MemoryAttr fill;
  1674. size32_t fl = (size32_t)(p-trailer.indexPos);
  1675. memset(fill.allocate(fl),0xff,fl);
  1676. checkedwrite(trailer.indexPos,fl,fill.get());
  1677. }
  1678. checkedwrite(p,compblklen,compblkptr);
  1679. p += compblklen;
  1680. compblklen = 0;
  1681. }
  1682. trailer.indexPos = p;
  1683. if (trailer.recordSize==0) {
  1684. compressor->open(compblkptr, trailer.blockSize);
  1685. }
  1686. }
  1687. virtual void expand(const void *compbuf,MemoryBuffer &expbuf,size32_t expsize)
  1688. {
  1689. size32_t rs = trailer.recordSize;
  1690. if (rs) { // diff compress
  1691. const byte *src = (const byte *)compbuf;
  1692. byte *dst = (byte *)expbuf.reserve(expsize);
  1693. if (expsize) {
  1694. assertex(expsize>=rs);
  1695. memcpy(dst,src,rs);
  1696. dst += rs;
  1697. src += rs;
  1698. expsize -= rs;
  1699. while (expsize) {
  1700. assertex(expsize>=rs);
  1701. src += DiffExpand(src, dst, dst-rs, rs);
  1702. expsize -= rs;
  1703. dst += rs;
  1704. }
  1705. }
  1706. }
  1707. else { // lzw
  1708. assertex(expander.get());
  1709. size32_t exp = expander->init(compbuf);
  1710. if (exp!=expsize) {
  1711. throw MakeStringException(-1,"Compressed file format failure(%d,%d) - Encrypted?",exp,expsize);
  1712. }
  1713. expander->expand(expbuf.reserve(exp));
  1714. }
  1715. }
  1716. bool compressrow(const void *src,size32_t rs)
  1717. {
  1718. bool ret = true;
  1719. if (compblklen==0) {
  1720. memcpy(prevrowbuf.bufferBase(),src,rs);
  1721. memcpy(compblkptr,src,rs);
  1722. compblklen = rs;
  1723. }
  1724. else {
  1725. size32_t len = DiffCompress(src,compblkptr+compblklen,prevrowbuf.bufferBase(),rs);
  1726. if (compblklen+len>trailer.blockSize)
  1727. ret = false;
  1728. else
  1729. compblklen += len;
  1730. }
  1731. return ret;
  1732. }
  1733. size32_t compress(const void *expbuf,size32_t len) // iff return!=len then overflowed
  1734. {
  1735. const byte *src = (const byte *)expbuf;
  1736. size32_t rs = trailer.recordSize;
  1737. if (rs) { // diff compress
  1738. if (overflow.length()) {
  1739. assertex(overflow.length()<=rs);
  1740. size32_t left = rs-overflow.length();
  1741. if (left>len)
  1742. left = len;
  1743. overflow.append(left,expbuf);
  1744. len -= left;
  1745. if (overflow.length()==rs) {
  1746. if (!compressrow(overflow.toByteArray(),rs)) { // this is nasty case
  1747. overflow.setLength(rs-left);
  1748. return (size32_t)(src-(const byte *)expbuf);
  1749. }
  1750. overflow.clear();
  1751. }
  1752. src += left;
  1753. }
  1754. while (len>=rs) {
  1755. if (!compressrow(src,rs))
  1756. return (size32_t)(src-(const byte *)expbuf);
  1757. len -= rs;
  1758. src += rs;
  1759. }
  1760. if (len) {
  1761. overflow.append(len,src);
  1762. src += len;
  1763. }
  1764. }
  1765. else
  1766. src += compressor->write(src, len);
  1767. return (size32_t)(src-(const byte *)expbuf);
  1768. }
  1769. public:
  1770. IMPLEMENT_IINTERFACE;
  1771. CCompressedFile(IFileIO *_fileio,IMemoryMappedFile *_mmfile,CompressedFileTrailer &_trailer,ICFmode _mode, bool _setcrc,ICompressor *_compressor,IExpander *_expander, bool fast)
  1772. : fileio(_fileio), mmfile(_mmfile)
  1773. {
  1774. compressor.set(_compressor);
  1775. expander.set(_expander);
  1776. setcrc = _setcrc;
  1777. memcpy(&trailer,&_trailer,sizeof(trailer));
  1778. mode = _mode;
  1779. curblockpos = 0;
  1780. curblocknum = (unsigned)-1; // relies on wrap
  1781. if (mode!=ICFread) {
  1782. if (!_fileio&&_mmfile)
  1783. throw MakeStringException(-1,"Compressed Write not supported on memory mapped files");
  1784. if (trailer.recordSize) {
  1785. if ((trailer.recordSize>trailer.blockSize/4) || // just too big
  1786. (trailer.recordSize<10)) // or too small
  1787. trailer.recordSize = 0;
  1788. else
  1789. prevrowbuf.allocate(trailer.recordSize);
  1790. }
  1791. compblkptr = (byte *)compblk.allocate(trailer.blockSize+trailer.recordSize*2+16); // over estimate!
  1792. compblklen = 0;
  1793. if (trailer.recordSize==0) {
  1794. if (!compressor)
  1795. {
  1796. if (fast)
  1797. compressor.setown(createFastLZCompressor());
  1798. else
  1799. compressor.setown(createLZWCompressor(true));
  1800. }
  1801. compressor->open(compblkptr, trailer.blockSize);
  1802. }
  1803. }
  1804. if (mode!=ICFcreate) {
  1805. unsigned nb = trailer.numBlocks();
  1806. size32_t toread = sizeof(offset_t)*nb;
  1807. if (fileio) {
  1808. size32_t r = fileio->read(trailer.indexPos,toread,indexbuf.reserveTruncate(toread));
  1809. assertex(r==toread);
  1810. }
  1811. else {
  1812. assertex((memsize_t)trailer.indexPos==trailer.indexPos);
  1813. memcpy(indexbuf.reserveTruncate(toread),mmfile->base()+(memsize_t)trailer.indexPos,toread);
  1814. }
  1815. if (mode==ICFappend) {
  1816. curblocknum = nb-1;
  1817. if (setcrc) {
  1818. trailer.crc = trailer.datacrc;
  1819. trailer.datacrc = ~0U;
  1820. }
  1821. }
  1822. if (trailer.recordSize==0) {
  1823. if (!expander) {
  1824. if (fast)
  1825. expander.setown(createFastLZExpander());
  1826. else
  1827. expander.setown(createLZWExpander(true));
  1828. }
  1829. }
  1830. }
  1831. }
  1832. virtual ~CCompressedFile()
  1833. {
  1834. close();
  1835. }
  1836. virtual offset_t size()
  1837. {
  1838. CriticalBlock block(crit);
  1839. return trailer.expandedSize;
  1840. }
  1841. virtual size32_t read(offset_t pos, size32_t len, void * data)
  1842. {
  1843. CriticalBlock block(crit);
  1844. assertex(mode==ICFread);
  1845. size32_t ret=0;
  1846. while (pos<trailer.expandedSize) {
  1847. if ((offset_t)len>trailer.expandedSize-pos)
  1848. len = (size32_t)(trailer.expandedSize-pos);
  1849. if ((pos>=curblockpos)&&(pos<curblockpos+curblockbuf.length())) { // see if in current buffer
  1850. size32_t tocopy = (size32_t)(curblockpos+curblockbuf.length()-pos);
  1851. if (tocopy>len)
  1852. tocopy = len;
  1853. memcpy(data,curblockbuf.toByteArray()+(pos-curblockpos),tocopy);
  1854. ret += tocopy;
  1855. len -= tocopy;
  1856. data = (byte *)data+tocopy;
  1857. pos += tocopy;
  1858. }
  1859. if (len==0)
  1860. break;
  1861. getblock(pos);
  1862. }
  1863. return ret;
  1864. }
  1865. size32_t write(offset_t pos, size32_t len, const void * data)
  1866. {
  1867. CriticalBlock block(crit);
  1868. assertex(mode!=ICFread);
  1869. size32_t ret = 0;
  1870. loop {
  1871. if (pos!=trailer.expandedSize)
  1872. throw MakeStringException(-1,"sequential writes only on compressed file");
  1873. size32_t done = compress(data,len);
  1874. trailer.expandedSize += done;
  1875. len -= done;
  1876. ret += done;
  1877. pos += done;
  1878. data = (const byte *)data+done;
  1879. if (len==0)
  1880. break;
  1881. flush();
  1882. }
  1883. return ret;
  1884. }
  1885. void setSize(offset_t size) { UNIMPLEMENTED; }
  1886. offset_t appendFile(IFile *file,offset_t pos,offset_t len) { UNIMPLEMENTED; }
  1887. void close()
  1888. {
  1889. CriticalBlock block(crit);
  1890. if (mode!=ICFread) {
  1891. if (overflow.length()) {
  1892. unsigned ol = overflow.length();
  1893. overflow.clear();
  1894. throw MakeStringException(-1,"Partial row written at end of file %d of %d",ol,trailer.recordSize);
  1895. }
  1896. flush();
  1897. trailer.datacrc = trailer.crc;
  1898. if (setcrc) {
  1899. indexbuf.append(sizeof(trailer)-sizeof(trailer.crc),&trailer);
  1900. trailer.crc = crc32((const char *)indexbuf.toByteArray(),
  1901. indexbuf.length(),trailer.crc);
  1902. indexbuf.append(trailer.crc);
  1903. }
  1904. else {
  1905. trailer.datacrc = 0;
  1906. trailer.crc = ~0U;
  1907. indexbuf.append(sizeof(trailer),&trailer);
  1908. }
  1909. checkedwrite(trailer.indexPos,indexbuf.length(),indexbuf.toByteArray());
  1910. indexbuf.clear();
  1911. }
  1912. mode = ICFread;
  1913. curblockpos = 0;
  1914. curblocknum = (unsigned)-1; // relies on wrap
  1915. }
  1916. unsigned dataCRC()
  1917. {
  1918. if (mode==ICFread)
  1919. return trailer.datacrc;
  1920. return trailer.crc;
  1921. }
  1922. size32_t recordSize()
  1923. {
  1924. return trailer.recordSize;
  1925. }
  1926. size32_t blockSize()
  1927. {
  1928. return trailer.blockSize;
  1929. }
  1930. void setBlockSize(size32_t size)
  1931. {
  1932. trailer.blockSize = size;
  1933. }
  1934. bool readMode()
  1935. {
  1936. return (mode==ICFread);
  1937. }
  1938. unsigned method()
  1939. {
  1940. return trailer.method();
  1941. }
  1942. };
  1943. ICompressedFileIO *createCompressedFileReader(IFileIO *fileio,IExpander *expander)
  1944. {
  1945. if (fileio) {
  1946. offset_t fsize = fileio->size();
  1947. if (fsize>=sizeof(WinCompressedFileTrailer)) { // thats 8 bytes bigger but I think doesn't matter
  1948. WinCompressedFileTrailer wintrailer;
  1949. CompressedFileTrailer trailer;
  1950. if (fileio->read(fsize-sizeof(WinCompressedFileTrailer),sizeof(WinCompressedFileTrailer),&wintrailer)==sizeof(WinCompressedFileTrailer)) {
  1951. wintrailer.translate(trailer);
  1952. if ((trailer.compressedType==COMPRESSEDFILEFLAG)||(trailer.compressedType==FASTCOMPRESSEDFILEFLAG)) {
  1953. if (expander&&(trailer.recordSize!=0)) {
  1954. throw MakeStringException(-1, "Compressed file format error(%d), Encrypted?",trailer.recordSize);
  1955. }
  1956. CCompressedFile *cfile = new CCompressedFile(fileio,NULL,trailer,ICFread,false,NULL,expander,(trailer.compressedType==FASTCOMPRESSEDFILEFLAG));
  1957. return cfile;
  1958. }
  1959. }
  1960. }
  1961. }
  1962. return NULL;
  1963. }
  1964. ICompressedFileIO *createCompressedFileReader(IFile *file,IExpander *expander, bool memorymapped, IFEflags extraFlags)
  1965. {
  1966. if (file) {
  1967. if (memorymapped) {
  1968. Owned<IMemoryMappedFile> mmfile = file->openMemoryMapped();
  1969. if (mmfile) {
  1970. offset_t fsize = mmfile->fileSize();
  1971. if (fsize>=sizeof(WinCompressedFileTrailer)) { // thats 8 bytes bigger but I think doesn't matter
  1972. WinCompressedFileTrailer wintrailer;
  1973. CompressedFileTrailer trailer;
  1974. memcpy(&wintrailer,mmfile->base()+fsize-sizeof(WinCompressedFileTrailer),sizeof(WinCompressedFileTrailer));
  1975. wintrailer.translate(trailer);
  1976. if ((trailer.compressedType==COMPRESSEDFILEFLAG)||(trailer.compressedType==FASTCOMPRESSEDFILEFLAG)) {
  1977. if (expander&&(trailer.recordSize!=0)) {
  1978. throw MakeStringException(-1, "Compressed file format error(%d), Encrypted?",trailer.recordSize);
  1979. }
  1980. CCompressedFile *cfile = new CCompressedFile(NULL,mmfile,trailer,ICFread,false,NULL,expander,(trailer.compressedType==FASTCOMPRESSEDFILEFLAG));
  1981. return cfile;
  1982. }
  1983. }
  1984. }
  1985. }
  1986. Owned<IFileIO> fileio = file->open(IFOread, extraFlags);
  1987. if (fileio)
  1988. return createCompressedFileReader(fileio,expander);
  1989. }
  1990. return NULL;
  1991. }
  1992. ICompressedFileIO *createCompressedFileWriter(IFileIO *fileio,size32_t recordsize,bool _setcrc,ICompressor *compressor,bool fast)
  1993. {
  1994. CompressedFileTrailer trailer;
  1995. offset_t fsize = fileio->size();
  1996. if (fsize) {
  1997. loop {
  1998. if (fsize>=sizeof(WinCompressedFileTrailer)) { // thats 8 bytes bigger but I think doesn't matter
  1999. WinCompressedFileTrailer wintrailer;
  2000. CompressedFileTrailer trailer;
  2001. if (fileio->read(fsize-sizeof(WinCompressedFileTrailer),sizeof(WinCompressedFileTrailer),&wintrailer)==sizeof(WinCompressedFileTrailer)) {
  2002. wintrailer.translate(trailer);
  2003. if ((trailer.compressedType==COMPRESSEDFILEFLAG)||(trailer.compressedType==FASTCOMPRESSEDFILEFLAG)) {
  2004. if ((recordsize==trailer.recordSize)||!trailer.recordSize)
  2005. break;
  2006. throw MakeStringException(-1,"Appending to file with different record size (%d,%d)",recordsize,trailer.recordSize);
  2007. }
  2008. }
  2009. }
  2010. throw MakeStringException(-1,"Appending to file that is not compressed");
  2011. }
  2012. }
  2013. else {
  2014. memset(&trailer,0,sizeof(trailer));
  2015. trailer.crc = ~0U;
  2016. trailer.compressedType = fast?FASTCOMPRESSEDFILEFLAG:COMPRESSEDFILEFLAG;
  2017. trailer.blockSize = COMPRESSEDFILEBLOCKSIZE;
  2018. trailer.recordSize = recordsize;
  2019. }
  2020. if (compressor)
  2021. trailer.recordSize = 0; // force not row compressed if compressor specified
  2022. CCompressedFile *cfile = new CCompressedFile(fileio,NULL,trailer,fsize?ICFappend:ICFcreate,_setcrc,compressor,NULL,fast);
  2023. return cfile;
  2024. }
  2025. ICompressedFileIO *createCompressedFileWriter(IFile *file,size32_t recordsize,bool append,bool _setcrc,ICompressor *compressor,bool fast, IFEflags extraFlags)
  2026. {
  2027. if (file) {
  2028. if (append&&!file->exists())
  2029. append = false;
  2030. Owned<IFileIO> fileio = file->open(append?IFOreadwrite:IFOcreate, extraFlags);
  2031. if (fileio)
  2032. return createCompressedFileWriter(fileio,recordsize,_setcrc,compressor,fast);
  2033. }
  2034. return NULL;
  2035. }
  2036. //===================================================================================
  2037. #define AES_PADDING_SIZE 32
  2038. class CAESCompressor : public CInterface, implements ICompressor
  2039. {
  2040. Owned<ICompressor> comp; // base compressor
  2041. MemoryAttr compattr; // compressed buffer
  2042. MemoryAttr outattr; // compressed and encrypted (if outblk NULL)
  2043. void *outbuf; // dest
  2044. size32_t outlen;
  2045. size32_t outmax;
  2046. MemoryAttr key;
  2047. public:
  2048. IMPLEMENT_IINTERFACE;
  2049. CAESCompressor(const void *_key, unsigned _keylen)
  2050. : key(_keylen,_key)
  2051. {
  2052. comp.setown(createLZWCompressor(true));
  2053. outlen = 0;
  2054. }
  2055. void open(void *blk,size32_t blksize)
  2056. {
  2057. outlen = 0;
  2058. outmax = blksize;
  2059. if (blk)
  2060. outbuf = blk;
  2061. else
  2062. outbuf = outattr.allocate(blksize);
  2063. size32_t subsz = blksize-AES_PADDING_SIZE-sizeof(size32_t);
  2064. comp->open(compattr.allocate(subsz),subsz);
  2065. }
  2066. void close()
  2067. {
  2068. comp->close();
  2069. // now encrypt
  2070. byte *p = (byte *)comp->bufptr();
  2071. size32_t bl = comp->buflen();
  2072. MemoryBuffer buf;
  2073. aesEncrypt(key.get(), key.length(), comp->bufptr(), comp->buflen(), buf);
  2074. outlen = buf.length();
  2075. memcpy(outbuf,&outlen,sizeof(size32_t));
  2076. outlen += sizeof(size32_t);
  2077. assertex(outlen<=outmax);
  2078. memcpy((byte *)outbuf+sizeof(size32_t),buf.bufferBase(),buf.length());
  2079. }
  2080. size32_t write(const void *buf,size32_t len)
  2081. {
  2082. return comp->write(buf,len);
  2083. }
  2084. void * bufptr()
  2085. {
  2086. return outbuf;
  2087. }
  2088. size32_t buflen()
  2089. {
  2090. return outlen;
  2091. }
  2092. void startblock()
  2093. {
  2094. comp->startblock();
  2095. }
  2096. void commitblock()
  2097. {
  2098. comp->commitblock();
  2099. }
  2100. };
  2101. class CAESExpander : public CInterface, implements IExpander
  2102. {
  2103. Owned<IExpander> exp; // base expander
  2104. MemoryBuffer compbuf;
  2105. MemoryAttr key;
  2106. public:
  2107. IMPLEMENT_IINTERFACE;
  2108. CAESExpander(const void *_key, unsigned _keylen)
  2109. : key(_keylen,_key)
  2110. {
  2111. exp.setown(createLZWExpander(true));
  2112. }
  2113. size32_t init(const void *blk)
  2114. {
  2115. // first decrypt
  2116. const byte *p = (const byte *)blk;
  2117. size32_t l = *(const size32_t *)p;
  2118. aesDecrypt(key.get(),key.length(),p+sizeof(size32_t),l,compbuf);
  2119. return exp->init(compbuf.bufferBase());
  2120. }
  2121. void expand(void *target)
  2122. {
  2123. exp->expand(target);
  2124. }
  2125. virtual void * bufptr()
  2126. {
  2127. return exp->bufptr();
  2128. }
  2129. virtual size32_t buflen()
  2130. {
  2131. return exp->buflen();
  2132. }
  2133. };
  2134. ICompressor *createAESCompressor(const void *key, unsigned keylen)
  2135. {
  2136. return new CAESCompressor(key,keylen);
  2137. }
  2138. IExpander *createAESExpander(const void *key, unsigned keylen)
  2139. {
  2140. return new CAESExpander(key,keylen);
  2141. }
  2142. #define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (8 - (n))))
  2143. inline void padKey32(byte *keyout,size32_t len, const byte *key)
  2144. {
  2145. if (len==0)
  2146. memset(keyout,0xcc,32);
  2147. else if (len<=32) {
  2148. for (unsigned i=0;i<32;i++)
  2149. keyout[i] = (i<len)?key[i%len]:ROTATE_LEFT(key[i%len],i/len);
  2150. }
  2151. else {
  2152. memcpy(keyout,key,32);
  2153. // xor excess rotated
  2154. for (unsigned i=32;i<len;i++)
  2155. keyout[i%32] ^= ROTATE_LEFT(key[i],(i/8)%8);
  2156. }
  2157. }
  2158. ICompressor *createAESCompressor256(size32_t len, const void *key)
  2159. {
  2160. byte k[32];
  2161. padKey32(k,len,(const byte *)key);
  2162. return new CAESCompressor(k,32);
  2163. }
  2164. IExpander *createAESExpander256(size32_t len, const void *key)
  2165. {
  2166. byte k[32];
  2167. padKey32(k,len,(const byte *)key);
  2168. return new CAESExpander(k,32);
  2169. }
  2170. IPropertyTree *getBlockedFileDetails(IFile *file)
  2171. {
  2172. Owned<IPropertyTree> tree = createPTree("BlockedFile");
  2173. Owned<IFileIO> fileio = file?file->open(IFOread):NULL;
  2174. if (fileio) {
  2175. offset_t fsize = fileio->size();
  2176. tree->setPropInt64("@size",fsize);
  2177. if (fsize>=sizeof(WinCompressedFileTrailer)) { // thats 8 bytes bigger but I think doesn't matter
  2178. WinCompressedFileTrailer wintrailer;
  2179. CompressedFileTrailer trailer;
  2180. if (fileio->read(fsize-sizeof(WinCompressedFileTrailer),sizeof(WinCompressedFileTrailer),&wintrailer)==sizeof(WinCompressedFileTrailer)) {
  2181. wintrailer.translate(trailer);
  2182. if ((trailer.compressedType==COMPRESSEDFILEFLAG)||(trailer.compressedType==FASTCOMPRESSEDFILEFLAG)) {
  2183. trailer.setDetails(*tree);
  2184. unsigned nb = trailer.numBlocks();
  2185. MemoryAttr indexbuf;
  2186. size32_t toread = sizeof(offset_t)*nb;
  2187. size32_t r = fileio->read(trailer.indexPos,toread,indexbuf.allocate(toread));
  2188. if (r&&(r==toread)) {
  2189. offset_t s = 0;
  2190. const offset_t *index = (const offset_t *)indexbuf.bufferBase();
  2191. for (unsigned i=0;i<nb;i++) {
  2192. IPropertyTree * t = tree->addPropTree("Block",createPTree("Block"));
  2193. t->addPropInt64("@start",s);
  2194. offset_t p = s;
  2195. s = index[i];
  2196. t->addPropInt64("@end",s);
  2197. t->addPropInt64("@length",s-p);
  2198. }
  2199. }
  2200. return tree.getClear();
  2201. }
  2202. }
  2203. }
  2204. }
  2205. return NULL;
  2206. }
  2207. class CCompressHandlerArray : public IArrayOf<ICompressHandler>
  2208. {
  2209. public:
  2210. ICompressHandler *lookup(const char *type) const
  2211. {
  2212. ForEachItemIn(h, *this)
  2213. {
  2214. ICompressHandler &handler = item(h);
  2215. if (0 == stricmp(type, handler.queryType()))
  2216. return &handler;
  2217. }
  2218. return NULL;
  2219. }
  2220. } compressors;
  2221. bool addCompressorHandler(ICompressHandler *handler)
  2222. {
  2223. if (compressors.lookup(handler->queryType()))
  2224. {
  2225. handler->Release();
  2226. return false; // already registered
  2227. }
  2228. compressors.append(* handler);
  2229. return true;
  2230. }
  2231. bool removeCompressorHandler(ICompressHandler *handler)
  2232. {
  2233. return compressors.zap(* handler);
  2234. }
  2235. Linked<ICompressHandler> defaultCompressor;
  2236. MODULE_INIT(INIT_PRIORITY_STANDARD)
  2237. {
  2238. class CCompressHandlerBase : public CInterface, implements ICompressHandler
  2239. {
  2240. StringAttr type;
  2241. public:
  2242. IMPLEMENT_IINTERFACE;
  2243. CCompressHandlerBase(const char *_type) : type(_type) { }
  2244. // ICompressHandler
  2245. virtual const char *queryType() const { return type; }
  2246. };
  2247. class CFLZCompressHandler : public CCompressHandlerBase
  2248. {
  2249. public:
  2250. CFLZCompressHandler() : CCompressHandlerBase("FLZ") { }
  2251. virtual ICompressor *getCompressor(const char *options) { return createFastLZCompressor(); }
  2252. virtual IExpander *getExpander(const char *options) { return createFastLZExpander(); }
  2253. };
  2254. class CAESCompressHandler : public CCompressHandlerBase
  2255. {
  2256. public:
  2257. CAESCompressHandler() : CCompressHandlerBase("AES") { }
  2258. virtual ICompressor *getCompressor(const char *options)
  2259. {
  2260. assertex(options);
  2261. return createAESCompressor(options, strlen(options));
  2262. }
  2263. virtual IExpander *getExpander(const char *options)
  2264. {
  2265. assertex(options);
  2266. return createAESExpander(options, strlen(options));
  2267. }
  2268. };
  2269. class CDiffCompressHandler : public CCompressHandlerBase
  2270. {
  2271. public:
  2272. CDiffCompressHandler() : CCompressHandlerBase("DIFF") { }
  2273. virtual ICompressor *getCompressor(const char *options) { return createRDiffCompressor(); }
  2274. virtual IExpander *getExpander(const char *options) { return createRDiffExpander(); }
  2275. };
  2276. ICompressHandler *flzCompressor = new CFLZCompressHandler();
  2277. addCompressorHandler(flzCompressor);
  2278. addCompressorHandler(new CAESCompressHandler());
  2279. addCompressorHandler(new CDiffCompressHandler());
  2280. defaultCompressor.set(flzCompressor);
  2281. return true;
  2282. }
  2283. ICompressHandler *queryCompressHandler(const char *type)
  2284. {
  2285. return compressors.lookup(type);
  2286. }
  2287. void setDefaultCompressor(const char *type)
  2288. {
  2289. ICompressHandler *_defaultCompressor = queryCompressHandler(type);
  2290. if (!_defaultCompressor)
  2291. throw MakeStringException(-1, "setDefaultCompressor: '%s' compressor not registered", type);
  2292. defaultCompressor.set(_defaultCompressor);
  2293. }
  2294. ICompressHandler *queryDefaultCompressHandler()
  2295. {
  2296. return defaultCompressor;
  2297. }
  2298. ICompressor *getCompressor(const char *type, const char *options)
  2299. {
  2300. ICompressHandler *handler = compressors.lookup(type);
  2301. if (handler)
  2302. return handler->getCompressor(options);
  2303. return NULL;
  2304. }
  2305. IExpander *getExpander(const char *type, const char *options)
  2306. {
  2307. ICompressHandler *handler = compressors.lookup(type);
  2308. if (handler)
  2309. return handler->getExpander(options);
  2310. return NULL;
  2311. }
  2312. //===================================================================================
  2313. //#define TEST_ROWDIFF
  2314. #ifdef TEST_ROWDIFF
  2315. #include "jfile.hpp"
  2316. jlib_decl void testDiffComp(unsigned amount)
  2317. {
  2318. size32_t sz = 11;
  2319. Owned<IWriteSeqVar> out = createRowCompWriteSeq("test.out", sz);
  2320. { MTIME_SECTION(defaultTimer, "Comp Write");
  2321. int cpies;
  2322. for (cpies=0; cpies<amount; cpies++)
  2323. {
  2324. out->putn("Kate cccc \0A Another \0A Brother ", 3);
  2325. out->putn( "Jake Smith", 1);
  2326. out->putn( "Jake Brown", 1);
  2327. out->putn( "J Smith ", 1);
  2328. out->putn( "K Smith ", 1);
  2329. out->putn( "Kate Smith", 1);
  2330. out->putn( "Kate Brown", 1);
  2331. out->putn("Kate aaaa \0Kate bbbb ", 2);
  2332. out->putn("Kate cccc \0A Another \0A Brother ", 3);
  2333. out->putn( "A Brolley ", 1);
  2334. }
  2335. }
  2336. out.clear();
  2337. MemoryBuffer buf;
  2338. char *s = (char *) buf.reserve(sz);
  2339. { MTIME_SECTION(defaultTimer, "Comp read");
  2340. Owned<IReadSeqVar> in = createRowCompReadSeq("test.out", 0, sz);
  2341. count_t a = 0;
  2342. loop
  2343. {
  2344. size32_t tmpSz;
  2345. if (!in->get(sz, s, tmpSz))
  2346. break;
  2347. a++;
  2348. // DBGLOG("Entry: %s", s);
  2349. }
  2350. DBGLOG("read: %d", a);
  2351. }
  2352. { MTIME_SECTION(defaultTimer, "Comp read async std");
  2353. Owned<IFile> iFile = createIFile("test.out");
  2354. Owned<IFileAsyncIO> iFileIO = iFile->openAsync(IFOread);
  2355. Owned<IFileIOStream> iFileIOStream = createBufferedAsyncIOStream(iFileIO);
  2356. Owned<IReadSeqVar> in = createRowCompReadSeq(*iFileIOStream, 0, sz);
  2357. count_t a = 0;
  2358. loop
  2359. {
  2360. size32_t tmpSz;
  2361. if (!in->get(sz, s, tmpSz))
  2362. break;
  2363. a++;
  2364. // DBGLOG("Entry: %s", s);
  2365. }
  2366. DBGLOG("async std read: %d", a);
  2367. }
  2368. { MTIME_SECTION(defaultTimer, "Comp read async");
  2369. Owned<IReadSeqVar> in = createRowCompReadSeq("test.out", 0, sz, -1, true);
  2370. count_t a = 0;
  2371. loop
  2372. {
  2373. size32_t tmpSz;
  2374. if (!in->get(sz, s, tmpSz))
  2375. break;
  2376. a++;
  2377. // DBGLOG("Entry: %s", s);
  2378. }
  2379. DBGLOG("async read: %d", a);
  2380. }
  2381. }
  2382. #endif