jhash.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #include "platform.h"
  14. #include <limits.h>
  15. #include <ctype.h>
  16. #include <stdio.h>
  17. #include "jhash.hpp"
  18. #include "jmutex.hpp"
  19. #define PSTRINGDATA INT_MIN
  20. #define mix(a,b,c) \
  21. { \
  22. a -= b; a -= c; a ^= (c>>13); \
  23. b -= c; b -= a; b ^= (a<<8); \
  24. c -= a; c -= b; c ^= (b>>13); \
  25. a -= b; a -= c; a ^= (c>>12); \
  26. b -= c; b -= a; b ^= (a<<16); \
  27. c -= a; c -= b; c ^= (b>>5); \
  28. a -= b; a -= c; a ^= (c>>3); \
  29. b -= c; b -= a; b ^= (a<<10); \
  30. c -= a; c -= b; c ^= (b>>15); \
  31. }
  32. #define GETBYTE0(n) ((unsigned)k[n])
  33. #define GETBYTE1(n) ((unsigned)k[n+1]<<8)
  34. #define GETBYTE2(n) ((unsigned)k[n+2]<<16)
  35. #define GETBYTE3(n) ((unsigned)k[n+3]<<24)
  36. #define GETWORD(k,n) (GETBYTE0(n)+GETBYTE1(n)+GETBYTE2(n)+GETBYTE3(n))
  37. // the above looks inefficient but the compiler optimizes well
  38. // this hash looks slow but is about twice as quick as using our CRC table
  39. // and gives gives better results
  40. // (see paper at http://burtleburtle.net/bob/hash/evahash.html for more info)
  41. // Global atom table
  42. MODULE_INIT(INIT_PRIORITY_JHASH)
  43. {
  44. return true;
  45. }
  46. MODULE_EXIT()
  47. {
  48. // do not delete globalAtomTable as it will slow things down. If you
  49. // do not want these in your leak checking call releaseAtoms()
  50. }
  51. #define HASHONE(hash, c) { hash *= 0x01000193; hash ^= c; } // Fowler/Noll/Vo Hash... seems to work pretty well, and fast
  52. unsigned hashc( const unsigned char *k, unsigned length, unsigned initval)
  53. {
  54. unsigned hash = initval; // ^_rotl(length,2)??
  55. unsigned char c;
  56. while (length >= 8)
  57. {
  58. c = (*k++); HASHONE(hash, c);
  59. c = (*k++); HASHONE(hash, c);
  60. c = (*k++); HASHONE(hash, c);
  61. c = (*k++); HASHONE(hash, c);
  62. length-=4;
  63. }
  64. switch (length)
  65. {
  66. case 7: c = (*k++); HASHONE(hash, c); // fallthrough
  67. case 6: c = (*k++); HASHONE(hash, c); // fallthrough
  68. case 5: c = (*k++); HASHONE(hash, c); // fallthrough
  69. case 4: c = (*k++); HASHONE(hash, c); // fallthrough
  70. case 3: c = (*k++); HASHONE(hash, c); // fallthrough
  71. case 2: c = (*k++); HASHONE(hash, c); // fallthrough
  72. case 1: c = (*k++); HASHONE(hash, c);
  73. }
  74. return hash;
  75. }
  76. template <typename T>
  77. inline unsigned doHashValue( T value, unsigned initval)
  78. {
  79. //The values returned from this function are only consistent with those from hashn() if running on little endian architecture
  80. unsigned hash = initval;
  81. unsigned char c;
  82. for (unsigned i=0; i < sizeof(T); i++)
  83. {
  84. c = (byte)value;
  85. value >>= 8;
  86. HASHONE(hash, c);
  87. }
  88. return hash;
  89. }
  90. unsigned hashvalue( unsigned value, unsigned initval)
  91. {
  92. return doHashValue(value, initval);
  93. }
  94. unsigned hashvalue( unsigned __int64 value, unsigned initval)
  95. {
  96. return doHashValue(value, initval);
  97. }
  98. unsigned hashvalue( const void * value, unsigned initval)
  99. {
  100. return doHashValue((memsize_t)value, initval);
  101. }
  102. #define GETWORDNC(k,n) ((GETBYTE0(n)+GETBYTE1(n)+GETBYTE2(n)+GETBYTE3(n))&0xdfdfdfdf)
  103. unsigned hashnc( const unsigned char *k, unsigned length, unsigned initval)
  104. {
  105. unsigned hash = initval;
  106. unsigned char c;
  107. while (length >= 8)
  108. {
  109. c = (*k++)&0xdf; HASHONE(hash, c);
  110. c = (*k++)&0xdf; HASHONE(hash, c);
  111. c = (*k++)&0xdf; HASHONE(hash, c);
  112. c = (*k++)&0xdf; HASHONE(hash, c);
  113. length-=4;
  114. }
  115. switch (length)
  116. {
  117. case 7: c = (*k++)&0xdf; HASHONE(hash, c); // fallthrough
  118. case 6: c = (*k++)&0xdf; HASHONE(hash, c); // fallthrough
  119. case 5: c = (*k++)&0xdf; HASHONE(hash, c); // fallthrough
  120. case 4: c = (*k++)&0xdf; HASHONE(hash, c); // fallthrough
  121. case 3: c = (*k++)&0xdf; HASHONE(hash, c); // fallthrough
  122. case 2: c = (*k++)&0xdf; HASHONE(hash, c); // fallthrough
  123. case 1: c = (*k++)&0xdf; HASHONE(hash, c);
  124. }
  125. return hash;
  126. }
  127. MappingKey::MappingKey(const void * inKey, int keysize)
  128. {
  129. int ksm = keysize;
  130. if (!ksm)
  131. ksm = (size32_t)strlen((char *) inKey) + 1;
  132. else if (ksm<0)
  133. {
  134. if (ksm==PSTRINGDATA) {
  135. ksm = (*((unsigned char *)inKey))+1;
  136. }
  137. else {
  138. ksm = -ksm;
  139. ksm += (size32_t)strlen((char *) inKey + ksm) + 1;
  140. }
  141. }
  142. void * temp = malloc(ksm);
  143. memcpy(temp, inKey, ksm);
  144. key = temp;
  145. }
  146. //-- Mapping ---------------------------------------------------
  147. unsigned MappingBase::getHash() const { return hash; }
  148. void MappingBase::setHash(unsigned hval){ hash = hval; }
  149. const void * Mapping::getKey() const { return key.key; }
  150. //-- HashMapping ---------------------------------------------------
  151. //const void * HashMapping::getKey() const { return key.key; }
  152. //-- Atom ---------------------------------------------------
  153. unsigned AtomBase::getHash() const { return hash; }
  154. void AtomBase::setHash(unsigned hval) { hash = hval; }
  155. const void * AtomBase::getKey() const { return key; }
  156. //-- Case Atom ---------------------------------------------------
  157. CaseAtom::CaseAtom(const void * k) : hash(0)
  158. {
  159. text = strdup((const char *)k);
  160. lower = createLowerCaseAtom(text);
  161. }
  162. //-- HashTable ---------------------------------------------------
  163. bool HashTable::addOwn(IMapping & donor)
  164. {
  165. if(add(donor))
  166. {
  167. donor.Release();
  168. return true;
  169. }
  170. return false;
  171. }
  172. bool HashTable::replaceOwn(IMapping & donor)
  173. {
  174. if(replace(donor))
  175. {
  176. donor.Release();
  177. return true;
  178. }
  179. return false;
  180. }
  181. IMapping * HashTable::findLink(const void * findParam) const
  182. {
  183. IMapping * found = SuperHashTableOf<IMapping, const void>::find(findParam);
  184. if(found) found->Link();
  185. return found;
  186. }
  187. bool HashTable::onNotify(INotification & notify)
  188. {
  189. bool ret = true;
  190. if (notify.getAction() == NotifyOnDispose)
  191. {
  192. IMapping * mapping = static_cast<IMapping *>(notify.querySource());
  193. ret = removeExact(mapping);
  194. assertex(ret);
  195. }
  196. return ret;
  197. }
  198. bool HashTable::keyeq(const void *key1, const void *key2, int ksize) const
  199. {
  200. if (ksize<=0)
  201. {
  202. if (ksize<0)
  203. {
  204. if (ksize==PSTRINGDATA)
  205. return (memcmp(key1,key2,(*(unsigned char*)key1)) == 0);
  206. unsigned ksm = -ksize;
  207. if (memcmp(key1,key2,ksm))
  208. return false;
  209. key1 = (char *)key1 + ksm;
  210. key2 = (char *)key2 + ksm;
  211. }
  212. if (ignorecase)
  213. {
  214. unsigned char *k1 = (unsigned char *) key1;
  215. unsigned char *k2 = (unsigned char *) key2;
  216. for (;;)
  217. {
  218. unsigned char c1 = toupper(*k1);
  219. if (c1 != toupper(*k2))
  220. return false;
  221. if (c1 == 0)
  222. return true;
  223. k1++;
  224. k2++;
  225. }
  226. }
  227. return strcmp((char *)key1, (char *)key2) == 0;
  228. }
  229. return memcmp(key1,key2,ksize)==0;
  230. }
  231. unsigned HashTable::hash(const void *key, int ksize) const
  232. {
  233. unsigned h = 0x811C9DC5;
  234. unsigned char *bp = (unsigned char *) key;
  235. if (ksize<=0)
  236. {
  237. if (ksize==PSTRINGDATA) {
  238. ksize = (*(unsigned char*)key)+1;
  239. goto BlockHash;
  240. }
  241. if (ksize<0) {
  242. h = hashc(bp,-ksize,h);
  243. bp -= ksize;
  244. }
  245. unsigned char* ks = bp;
  246. while (*bp) bp++;
  247. if (ignorecase)
  248. h = hashnc(ks,(size32_t)(bp-ks),h);
  249. else
  250. h = hashc(ks,(size32_t)(bp-ks),h);
  251. }
  252. else
  253. {
  254. BlockHash:
  255. if (ignorecase)
  256. h = hashnc(bp,ksize,h);
  257. else
  258. h = hashc(bp,ksize,h);
  259. }
  260. //Strings that only differ by a single character don't generate hashes very far apart without this
  261. h *= 0x01000193;
  262. //h = ((h << 7) ^ (h >> 25));
  263. return h;
  264. }
  265. IMapping * HashTable::createLink(const void *key)
  266. {
  267. IMapping * match = findLink(key);
  268. if (!match)
  269. {
  270. match = newMapping(key);
  271. if (match) add(*match); // link for hash table
  272. }
  273. return match;
  274. }
  275. IMapping *HashTable::newMapping(const void *)
  276. {
  277. assertex(!"Newmapping must be overridden to use HashTable::Create");
  278. return NULL;
  279. }
  280. // -- Helper functions...
  281. void IntersectHash(HashTable & h1, HashTable & h2)
  282. {
  283. HashIterator iter(h1);
  284. iter.first();
  285. while (iter.isValid())
  286. {
  287. IMapping & cur = iter.query();
  288. IMapping * matched = h2.find(cur.getKey());
  289. iter.next();
  290. if (!matched)
  291. h1.removeExact(&cur);
  292. }
  293. }
  294. void UnionHash(HashTable & h1, HashTable & h2)
  295. {
  296. HashIterator iter(h2);
  297. iter.first();
  298. while (iter.isValid())
  299. {
  300. IMapping & cur = iter.query();
  301. IMapping * matched = h1.find(cur.getKey());
  302. if (!matched)
  303. h1.add(cur);
  304. iter.next();
  305. }
  306. }
  307. void SubtractHash(HashTable & main, HashTable & sub)
  308. {
  309. HashIterator iter(sub);
  310. iter.first();
  311. while (iter.isValid())
  312. {
  313. IMapping & cur = iter.query();
  314. IMapping * matched = main.find(cur.getKey());
  315. iter.next();
  316. if (matched)
  317. main.removeExact(&cur);
  318. }
  319. }
  320. //===========================================================================
  321. void KeptHashTable::onAdd(void * et)
  322. {
  323. static_cast<IMapping *>(et)->Link();
  324. }
  325. void KeptHashTable::onRemove(void * et)
  326. {
  327. static_cast<IMapping *>(et)->Release();
  328. }
  329. IMapping * KeptHashTable::create(const void *key)
  330. {
  331. IMapping * match = find(key);
  332. if (!match)
  333. {
  334. match = newMapping(key);
  335. if (match) addOwn(*match); // already linked for hash table
  336. }
  337. return match;
  338. }
  339. void ObservedHashTable::onAdd(void * et)
  340. {
  341. static_cast<IMapping *>(et)->addObserver(*this);
  342. }
  343. void ObservedHashTable::onRemove(void * et)
  344. {
  345. static_cast<IMapping *>(et)->removeObserver(*this);
  346. }
  347. //===========================================================================
  348. static CriticalSection atomCrit;
  349. static KeptLowerCaseAtomTable *globalAtomTable = NULL;
  350. inline KeptLowerCaseAtomTable * queryGlobalAtomTable()
  351. {
  352. if (!globalAtomTable)
  353. {
  354. globalAtomTable = new KeptLowerCaseAtomTable;
  355. globalAtomTable->reinit(2000);
  356. }
  357. return globalAtomTable;
  358. }
  359. extern jlib_decl IAtom * createAtom(const char *value)
  360. {
  361. if (!value) return NULL;
  362. CriticalBlock crit(atomCrit);
  363. return queryGlobalAtomTable()->addAtom(value);
  364. }
  365. extern jlib_decl IAtom * createAtom(const char *value, size32_t len)
  366. {
  367. if (!value || !len)
  368. return NULL;
  369. char * nullTerminated = (char *)alloca(len+1);
  370. memcpy(nullTerminated, value, len);
  371. nullTerminated[len] = 0;
  372. CriticalBlock crit(atomCrit);
  373. return queryGlobalAtomTable()->addAtom(nullTerminated);
  374. }
  375. //===========================================================================
  376. static CriticalSection caseAtomCrit;
  377. static KeptCaseAtomTable *globalCaseAtomTable = NULL;
  378. inline KeptCaseAtomTable * queryGlobalCaseAtomTable()
  379. {
  380. if (!globalCaseAtomTable)
  381. {
  382. globalCaseAtomTable = new KeptCaseAtomTable;
  383. globalCaseAtomTable->reinit(2000);
  384. }
  385. return globalCaseAtomTable;
  386. }
  387. extern jlib_decl IIdAtom * createIdAtom(const char *value)
  388. {
  389. if (!value) return NULL;
  390. CriticalBlock crit(caseAtomCrit);
  391. return queryGlobalCaseAtomTable()->addAtom(value);
  392. }
  393. extern jlib_decl IIdAtom * createIdAtom(const char *value, size32_t len)
  394. {
  395. if (!value || !len)
  396. return NULL;
  397. char * nullTerminated = (char *)alloca(len+1);
  398. memcpy(nullTerminated, value, len);
  399. nullTerminated[len] = 0;
  400. CriticalBlock crit(caseAtomCrit);
  401. return queryGlobalCaseAtomTable()->addAtom(nullTerminated);
  402. }
  403. #ifdef THE_GLOBAL_HASH_TABLE_BECOMES_CASE_SENSITIVE
  404. extern jlib_decl IAtom * createLowerCaseAtom(const char *value)
  405. {
  406. if (!value) return NULL;
  407. unsigned len = strlen(value);
  408. const byte * src = (const byte *)value;
  409. char * lower = (char *)alloca(len+1);
  410. for (unsigned i=0; i < len; i++)
  411. lower[i] = tolower(src[i]);
  412. lower[len] = 0;
  413. CriticalBlock crit(atomCrit);
  414. return queryGlobalAtomTable()->addAtom(value);
  415. }
  416. extern jlib_decl IAtom * createLowerCaseAtom(const char *value, size32_t len)
  417. {
  418. if (!value || !len)
  419. return NULL;
  420. const byte * src = (const byte *)value;
  421. char * lower = (char *)alloca(len+1);
  422. for (unsigned i=0; i < len; i++)
  423. lower[i] = tolower(src[i]);
  424. lower[len] = 0;
  425. CriticalBlock crit(atomCrit);
  426. return queryGlobalAtomTable()->addAtom(lower);
  427. }
  428. #endif
  429. extern jlib_decl void releaseAtoms()
  430. {
  431. delete globalCaseAtomTable;
  432. globalCaseAtomTable = NULL;
  433. delete globalAtomTable;
  434. globalAtomTable = NULL;
  435. }