archive.cpp 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #include "platform.h"
  14. #include "jlib.hpp"
  15. #include "jio.hpp"
  16. #include "jmutex.hpp"
  17. #include "jfile.hpp"
  18. #include "jlog.hpp"
  19. #include "jregexp.hpp"
  20. #include "archive.hpp"
  21. #include <sys/stat.h>
  22. #include <archive.h>
  23. #include <archive_entry.h>
  24. /*
  25. * Direct access to files in zip archives (and other libarchive-supported formats), without needing to extract them first
  26. * Installs hooks into createIFile, spotting filenames of the form /my/directory/myfile.zip/{password}/path/within/archive
  27. */
  28. #ifdef _WIN32
  29. #define ARCHIVE_SIGNATURE "[.]{zip|tar|tar[.]gz|tgz}{$|/|\\\\}"
  30. #else
  31. #define ARCHIVE_SIGNATURE "[.]{zip|tar|tar[.]gz|tgz}{$|/}"
  32. #endif
  33. static RegExpr *signature;
  34. static SpinLock *lock;
  35. static const char *splitName(const char *fileName)
  36. {
  37. if (!fileName)
  38. return NULL;
  39. SpinBlock b(*lock);
  40. const char *sig = signature->find(fileName);
  41. if (sig)
  42. return sig+signature->findlen();
  43. else
  44. return NULL;
  45. }
  46. static void splitArchivedFileName(const char *fullName, StringAttr &container, StringAttr &option, StringAttr &relPath)
  47. {
  48. const char *tail = splitName(fullName);
  49. assertex(tail);
  50. size_t containerLen = tail-fullName;
  51. if (fullName[containerLen-1]==PATHSEPCHAR)
  52. containerLen--;
  53. container.set(fullName, containerLen);
  54. if (*tail=='{')
  55. {
  56. tail++;
  57. const char *end = strchr(tail, '}');
  58. if (!end)
  59. throw MakeStringException(0, "Invalid archive-embedded filename - no matching } found");
  60. option.set(tail, end - tail);
  61. tail = end+1;
  62. if (*tail==PATHSEPCHAR)
  63. tail++;
  64. else if (*tail != 0)
  65. throw MakeStringException(0, "Invalid archive-embedded filename - " PATHSEPSTR " expected after }");
  66. }
  67. else
  68. option.clear();
  69. if (tail && *tail)
  70. {
  71. StringBuffer s(tail);
  72. s.replace(PATHSEPCHAR, '/');
  73. relPath.set(s);
  74. }
  75. else
  76. relPath.clear();
  77. }
  78. static StringBuffer & buildArchivedFileName(StringBuffer &fullname, const char *archiveFile, const char *option, const char *relPath)
  79. {
  80. fullname.append(archiveFile);
  81. if (option && *option)
  82. fullname.append(PATHSEPCHAR).append('{').append(option).append('}');
  83. if (relPath && *relPath)
  84. fullname.append(PATHSEPCHAR).append(relPath);
  85. return fullname;
  86. }
  87. IDirectoryIterator *createArchiveDirectoryIterator(const char *gitFileName, const char *mask, bool sub, bool includeDirs);
  88. // Wrapper around libarchive's archive_entry struct to ensure we free them at right time
  89. // Because not clear whether safe to use a struct archive_entry object after the archive has been closed,
  90. // we copy the info we need out of them into something we CAN be sure of the lifespan of
  91. class ArchiveEntry : public CInterface, implements IInterface
  92. {
  93. public:
  94. IMPLEMENT_IINTERFACE;
  95. ArchiveEntry(struct archive_entry *entry)
  96. {
  97. mode = archive_entry_filetype(entry);
  98. filesize = archive_entry_size(entry);
  99. path.set(archive_entry_pathname(entry));
  100. accessTime = archive_entry_atime(entry);
  101. createTime = archive_entry_ctime(entry);
  102. modifiedTime = archive_entry_mtime(entry);
  103. }
  104. bool isDir() const
  105. {
  106. return S_ISDIR(mode);
  107. }
  108. inline offset_t size()
  109. {
  110. return filesize;
  111. }
  112. const char *pathname()
  113. {
  114. return path.get();
  115. }
  116. CDateTime &getAccessTime(CDateTime &t)
  117. {
  118. t.set(accessTime);
  119. return t;
  120. }
  121. CDateTime &getCreateTime(CDateTime &t)
  122. {
  123. t.set(createTime);
  124. return t;
  125. }
  126. CDateTime &getModifiedTime(CDateTime &t)
  127. {
  128. t.set(modifiedTime);
  129. return t;
  130. }
  131. private:
  132. unsigned mode;
  133. offset_t filesize;
  134. StringAttr path;
  135. time_t accessTime;
  136. time_t createTime;
  137. time_t modifiedTime;
  138. };
  139. // IFileIO implementation for reading out of libarchive-supported archives
  140. // Because of the nature of the libarchive this may not be efficient for some archive formats
  141. // Have to read through the entire archive directory to find the bit you want, it seems
  142. // It's possible that we could add some seek support to at least avoid having to do so twice?
  143. class ArchiveFileIO : public CInterface, implements IFileIO
  144. {
  145. public:
  146. IMPLEMENT_IINTERFACE;
  147. ArchiveFileIO(const char *_fullName) : fullName(_fullName)
  148. {
  149. // Sadly it seems we can't use a saved entry to read data from an archive. We have to open a new archive
  150. // object and scan through until we find the matching file, in order to extract it.
  151. StringAttr container, option, relpath;
  152. splitArchivedFileName(_fullName, container, option, relpath);
  153. curPos = 0;
  154. lastPos = 0;
  155. curBuffSize = 0;
  156. curBuff = NULL;
  157. archive = archive_read_new();
  158. archive_read_support_format_all(archive);
  159. archive_read_support_compression_all(archive);
  160. int retcode = archive_read_open_filename(archive, container, 10240);
  161. if (retcode == ARCHIVE_OK)
  162. {
  163. struct archive_entry *entry = archive_entry_new();
  164. while (archive_read_next_header2(archive, entry) == ARCHIVE_OK)
  165. {
  166. const char *filename = archive_entry_pathname(entry);
  167. if (strcmp(filename, relpath.get())==0)
  168. {
  169. fileSize = archive_entry_size(entry);
  170. break;
  171. }
  172. }
  173. archive_entry_free(entry);
  174. }
  175. }
  176. ~ArchiveFileIO()
  177. {
  178. archive_read_finish(archive);
  179. }
  180. virtual size32_t read(offset_t pos, size32_t len, void * _data)
  181. {
  182. // NOTE - we don't support multithreaded access (the sequential-only restriction would make that tricky anyway)
  183. if (pos < lastPos)
  184. throw MakeStringException(0, "Only sequential access to contained file %s supported", fullName.get());
  185. byte *data = (byte *) _data;
  186. size32_t lenRequested = len;
  187. while (len > 0 & pos < fileSize)
  188. {
  189. if (pos >= curPos+curBuffSize)
  190. {
  191. int ret = archive_read_data_block(archive, &curBuff, &curBuffSize, &curPos);
  192. if (ret != ARCHIVE_OK)
  193. {
  194. if (ret == ARCHIVE_EOF)
  195. break; // This shouldn't happen if the quoted fileSize was accurate...
  196. else
  197. throw MakeStringException(0, "Read error reading contained file %s", fullName.get());
  198. }
  199. }
  200. else
  201. {
  202. // Copy as much of the current request as we can fulfil from this block
  203. offset_t buffOffset = (pos - curPos);
  204. size_t copyLen = (curBuffSize - buffOffset) > len ? len : curBuffSize - buffOffset; // careful for overflows, we are mixing 64/32bit values
  205. if (curBuff)
  206. memcpy(data, ((const byte *) curBuff) + buffOffset, copyLen);
  207. else
  208. memset(data, 0, copyLen); // Sparse areas of compressed files may be represented with NULL buffers
  209. data += copyLen;
  210. len -= copyLen;
  211. pos += copyLen;
  212. }
  213. }
  214. lastPos = pos;
  215. return lenRequested - len;
  216. }
  217. virtual offset_t size()
  218. {
  219. return fileSize;
  220. }
  221. virtual void close()
  222. {
  223. }
  224. // Write methods not implemented - this is a read-only file
  225. virtual size32_t write(offset_t pos, size32_t len, const void * data)
  226. {
  227. throwUnexpected();
  228. }
  229. virtual offset_t appendFile(IFile *file,offset_t pos=0,offset_t len=(offset_t)-1)
  230. {
  231. throwUnexpected();
  232. }
  233. virtual void setSize(offset_t size)
  234. {
  235. throwUnexpected();
  236. }
  237. virtual void flush()
  238. {
  239. throwUnexpected();
  240. }
  241. protected:
  242. struct archive *archive;
  243. offset_t fileSize;
  244. #if ARCHIVE_VERSION_NUMBER < 3000000
  245. off_t curPos;
  246. #else
  247. int64_t curPos;
  248. #endif
  249. offset_t lastPos;
  250. size_t curBuffSize;
  251. const void *curBuff;
  252. StringAttr fullName;
  253. };
  254. // IFile implementation for reading out of libarchive-supported archives
  255. // These use the struct_archive_entry objects allocated in the directory iterator
  256. // in the hope they might be useful for directly seeking to the file to be extracted
  257. // at some point.
  258. class ArchiveFile : public CInterface, implements IFile
  259. {
  260. public:
  261. IMPLEMENT_IINTERFACE;
  262. ArchiveFile(const char *_fileName, ArchiveEntry *_entry)
  263. : fullName(_fileName),entry(_entry)
  264. {
  265. }
  266. virtual bool exists()
  267. {
  268. return entry != NULL;
  269. }
  270. virtual bool getTime(CDateTime * createTime, CDateTime * modifiedTime, CDateTime * accessedTime)
  271. {
  272. if (entry)
  273. {
  274. if (accessedTime)
  275. entry->getAccessTime(*accessedTime);
  276. if (createTime)
  277. entry->getCreateTime(*createTime);
  278. if (modifiedTime)
  279. entry->getModifiedTime(*modifiedTime);
  280. return true;
  281. }
  282. else
  283. return false;
  284. }
  285. virtual fileBool isDirectory()
  286. {
  287. if (!entry)
  288. return notFound;
  289. return entry->isDir() ? foundYes : foundNo;
  290. }
  291. virtual fileBool isFile()
  292. {
  293. if (!entry)
  294. return notFound;
  295. return entry->isDir() ? foundNo : foundYes;
  296. }
  297. virtual fileBool isReadOnly()
  298. {
  299. if (!entry)
  300. return notFound;
  301. return foundYes;
  302. }
  303. virtual IFileIO * open(IFOmode mode)
  304. {
  305. assertex(mode==IFOread && entry != NULL);
  306. return new ArchiveFileIO(fullName.str());
  307. }
  308. virtual IFileAsyncIO * openAsync(IFOmode mode)
  309. {
  310. UNIMPLEMENTED;
  311. }
  312. virtual IFileIO * openShared(IFOmode mode, IFSHmode shmode)
  313. {
  314. assertex(mode==IFOread && entry != NULL);
  315. return new ArchiveFileIO(fullName.str());
  316. }
  317. virtual const char * queryFilename()
  318. {
  319. return fullName.str();
  320. }
  321. virtual offset_t size()
  322. {
  323. if (!entry)
  324. return 0;
  325. return entry->size();
  326. }
  327. // Directory functions
  328. virtual IDirectoryIterator *directoryFiles(const char *mask, bool sub, bool includeDirs)
  329. {
  330. if (isDirectory() != foundYes || (mask && !*mask)) // Empty mask string means matches nothing - NULL means matches everything
  331. return createNullDirectoryIterator();
  332. else
  333. {
  334. StringBuffer dirName(fullName);
  335. dirName.append(PATHSEPCHAR);
  336. return createArchiveDirectoryIterator(dirName, mask, sub, includeDirs);
  337. }
  338. }
  339. virtual bool getInfo(bool &_isdir,offset_t &_size,CDateTime &_modtime)
  340. {
  341. _isdir = isDirectory()==foundYes;
  342. _size = size();
  343. _modtime.clear(); // MORE could probably do better
  344. return true; // MORE should this be false if not existing?
  345. }
  346. // Not going to be implemented - this IFile interface is too big..
  347. virtual bool setTime(const CDateTime * createTime, const CDateTime * modifiedTime, const CDateTime * accessedTime) { UNIMPLEMENTED; }
  348. virtual bool remove() { UNIMPLEMENTED; }
  349. virtual void rename(const char *newTail) { UNIMPLEMENTED; }
  350. virtual void move(const char *newName) { UNIMPLEMENTED; }
  351. virtual void setReadOnly(bool ro) { UNIMPLEMENTED; }
  352. virtual bool setCompression(bool set) { UNIMPLEMENTED; }
  353. virtual offset_t compressedSize() { UNIMPLEMENTED; }
  354. virtual unsigned getCRC() { UNIMPLEMENTED; }
  355. virtual void setCreateFlags(unsigned cflags) { UNIMPLEMENTED; }
  356. virtual void setShareMode(IFSHmode shmode) { UNIMPLEMENTED; }
  357. virtual bool createDirectory() { UNIMPLEMENTED; }
  358. virtual IDirectoryDifferenceIterator *monitorDirectory(
  359. IDirectoryIterator *prev=NULL, // in (NULL means use current as baseline)
  360. const char *mask=NULL,
  361. bool sub=false,
  362. bool includedirs=false,
  363. unsigned checkinterval=60*1000,
  364. unsigned timeout=(unsigned)-1,
  365. Semaphore *abortsem=NULL) { UNIMPLEMENTED; }
  366. virtual void copySection(const RemoteFilename &dest, offset_t toOfs=(offset_t)-1, offset_t fromOfs=0, offset_t size=(offset_t)-1, ICopyFileProgress *progress=NULL) { UNIMPLEMENTED; }
  367. virtual void copyTo(IFile *dest, size32_t buffersize=0x100000, ICopyFileProgress *progress=NULL, bool usetmp=false) { UNIMPLEMENTED; }
  368. virtual IMemoryMappedFile *openMemoryMapped(offset_t ofs=0, memsize_t len=(memsize_t)-1, bool write=false) { UNIMPLEMENTED; }
  369. virtual void treeCopyTo(IFile *dest,IpSubNet &subnet,IpAddress &resfrom,bool usetmp=false) { UNIMPLEMENTED; }
  370. protected:
  371. StringBuffer fullName;
  372. Linked<ArchiveEntry> entry;
  373. };
  374. static IFile *createIFileInArchive(const char *containedFileName)
  375. {
  376. StringBuffer fname(containedFileName);
  377. assertex(fname.length());
  378. removeTrailingPathSepChar(fname);
  379. StringAttr container, option, relpath;
  380. splitArchivedFileName(fname.str(), container, option, relpath);
  381. if (relpath.length())
  382. {
  383. StringBuffer dirPath, dirTail;
  384. dirPath.append(container).append(option);
  385. splitFilename(relpath, &dirPath, &dirPath, &dirTail, &dirTail);
  386. Owned<IDirectoryIterator> dir = createArchiveDirectoryIterator(dirPath.str(), dirTail.str(), false, true);
  387. if (dir->first())
  388. {
  389. Linked<IFile> file = &dir->query();
  390. assertex(!dir->next());
  391. return file.getClear();
  392. }
  393. else
  394. return new ArchiveFile(containedFileName, NULL);
  395. }
  396. else
  397. {
  398. // Create an IFile representing the root of the archive as a directory
  399. struct archive_entry *rootEntry = archive_entry_new();
  400. archive_entry_set_pathname(rootEntry, ".");
  401. archive_entry_set_mode(rootEntry, S_IFDIR);
  402. archive_entry_set_size(rootEntry, 0);
  403. return new ArchiveFile(containedFileName, new ArchiveEntry(rootEntry));
  404. }
  405. }
  406. class ArchiveDirectoryIterator : public CInterface, implements IDirectoryIterator
  407. {
  408. public:
  409. IMPLEMENT_IINTERFACE;
  410. ArchiveDirectoryIterator(const char *_containedFileName, const char *_mask, bool _sub, bool _includeDirs)
  411. : mask(_mask), sub(_sub), includeDirs(_includeDirs)
  412. {
  413. splitArchivedFileName(_containedFileName, container, option, relDir);
  414. curIndex = 0;
  415. }
  416. virtual StringBuffer &getName(StringBuffer &buf)
  417. {
  418. assertex(curFile);
  419. return buf.append(curFile->queryFilename());
  420. }
  421. virtual bool isDir()
  422. {
  423. assertex(curFile);
  424. return curFile->isDirectory();
  425. }
  426. virtual __int64 getFileSize()
  427. {
  428. assertex(curFile);
  429. return curFile->size();
  430. }
  431. virtual bool getModifiedTime(CDateTime &ret)
  432. {
  433. UNIMPLEMENTED;
  434. }
  435. virtual bool first()
  436. {
  437. curFile.clear();
  438. entries.kill();
  439. curIndex = 0;
  440. struct archive *archive = archive_read_new();
  441. archive_read_support_format_all(archive);
  442. archive_read_support_compression_all(archive);
  443. int retcode = archive_read_open_filename(archive, container, 10240);
  444. if (retcode == ARCHIVE_OK)
  445. {
  446. struct archive_entry *entry = archive_entry_new();
  447. while (archive_read_next_header2(archive, entry) == ARCHIVE_OK)
  448. {
  449. unsigned mode = archive_entry_filetype(entry);
  450. bool isDir = S_ISDIR(mode);
  451. if (includeDirs || !isDir)
  452. {
  453. const char *filename = archive_entry_pathname(entry);
  454. if (memcmp(filename, relDir.get(), relDir.length())==0)
  455. {
  456. StringBuffer tail(filename + relDir.length());
  457. if (tail.length())
  458. {
  459. if (tail.charAt(tail.length()-1)=='/' || tail.charAt(tail.length()-1)==PATHSEPCHAR)
  460. tail.remove(tail.length()-1, 1);
  461. }
  462. else
  463. {
  464. assert(isDir);
  465. tail.append(".");
  466. }
  467. // Strip off a trailing /, then check that there is no / in the tail
  468. if (strchr(tail, PATHSEPCHAR) == NULL && (!mask.length() || WildMatch(tail, mask, false)))
  469. {
  470. entries.append(*new ArchiveEntry(entry));
  471. }
  472. }
  473. }
  474. }
  475. archive_entry_free(entry);
  476. }
  477. archive_read_finish(archive);
  478. return next();
  479. }
  480. virtual bool next()
  481. {
  482. if (entries.isItem(curIndex))
  483. {
  484. ArchiveEntry &entry = entries.item(curIndex);
  485. curIndex++;
  486. const char *filename = entry.pathname();
  487. StringBuffer containedFileName;
  488. buildArchivedFileName(containedFileName, container, option, filename);
  489. removeTrailingPathSepChar(containedFileName);
  490. curFile.setown(new ArchiveFile(containedFileName, &entry));
  491. return true;
  492. }
  493. else
  494. {
  495. curFile.clear();
  496. return false;
  497. }
  498. }
  499. virtual bool isValid() { return curFile != NULL; }
  500. virtual IFile & query() { return *curFile; }
  501. protected:
  502. StringAttr container;
  503. StringAttr option;
  504. StringAttr relDir;
  505. StringAttr mask;
  506. Owned<IFile> curFile;
  507. unsigned curIndex;
  508. IArrayOf<ArchiveEntry> entries; // The entries that matched
  509. bool includeDirs;
  510. bool sub;
  511. };
  512. IDirectoryIterator *createArchiveDirectoryIterator(const char *gitFileName, const char *mask, bool sub, bool includeDirs)
  513. {
  514. assertex(sub==false); // I don't know what it means!
  515. return new ArchiveDirectoryIterator(gitFileName, mask, sub, includeDirs);
  516. }
  517. class CArchiveFileHook : public CInterface, implements IContainedFileHook
  518. {
  519. public:
  520. IMPLEMENT_IINTERFACE;
  521. virtual IFile * createIFile(const char *fileName)
  522. {
  523. if (isArchiveFileName(fileName))
  524. return createIFileInArchive(fileName);
  525. else
  526. return NULL;
  527. }
  528. protected:
  529. static bool isArchiveFileName(const char *fileName)
  530. {
  531. if (fileName)
  532. return splitName(fileName) != NULL;
  533. return false;
  534. }
  535. } *archiveFileHook;
  536. extern ARCHIVEFILE_API void installFileHook()
  537. {
  538. SpinBlock b(*lock); // Probably overkill!
  539. if (!archiveFileHook)
  540. {
  541. archiveFileHook = new CArchiveFileHook;
  542. addContainedFileHook(archiveFileHook);
  543. }
  544. }
  545. extern ARCHIVEFILE_API void removeFileHook()
  546. {
  547. if (lock)
  548. {
  549. SpinBlock b(*lock); // Probably overkill!
  550. if (archiveFileHook)
  551. {
  552. removeContainedFileHook(archiveFileHook);
  553. delete archiveFileHook;
  554. archiveFileHook = NULL;
  555. }
  556. }
  557. }
  558. MODULE_INIT(INIT_PRIORITY_STANDARD)
  559. {
  560. lock = new SpinLock;
  561. signature = new RegExpr(ARCHIVE_SIGNATURE);
  562. archiveFileHook = NULL;
  563. return true;
  564. }
  565. MODULE_EXIT()
  566. {
  567. if (archiveFileHook)
  568. {
  569. removeContainedFileHook(archiveFileHook);
  570. archiveFileHook = NULL;
  571. }
  572. delete signature;
  573. delete lock;
  574. lock = NULL;
  575. signature = NULL;
  576. ::Release(archiveFileHook);
  577. }