archive.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #include "platform.h"
  14. #include "jlib.hpp"
  15. #include "jio.hpp"
  16. #include "jmutex.hpp"
  17. #include "jfile.hpp"
  18. #include "jlog.hpp"
  19. #include "jregexp.hpp"
  20. #include "archive.hpp"
  21. #include <sys/stat.h>
  22. #include <archive.h>
  23. #include <archive_entry.h>
  24. /*
  25. * Direct access to files in zip archives (and other libarchive-supported formats), without needing to extract them first
  26. * Installs hooks into createIFile, spotting filenames of the form /my/directory/myfile.zip/{password}/path/within/archive
  27. */
  28. #define ARCHIVE_SIGNATURE "[.]{zip|tar|tar[.]gz|tgz}{$|"PATHSEPSTR"}"
  29. static RegExpr *signature;
  30. static SpinLock *lock;
  31. static const char *splitName(const char *fileName)
  32. {
  33. if (!fileName)
  34. return NULL;
  35. SpinBlock b(*lock);
  36. const char *sig = signature->find(fileName);
  37. if (sig)
  38. return sig+signature->findlen();
  39. else
  40. return NULL;
  41. }
  42. static void splitArchivedFileName(const char *fullName, StringAttr &container, StringAttr &option, StringAttr &relPath)
  43. {
  44. const char *tail = splitName(fullName);
  45. assertex(tail);
  46. size_t containerLen = tail-fullName;
  47. if (fullName[containerLen-1]==PATHSEPCHAR)
  48. containerLen--;
  49. container.set(fullName, containerLen);
  50. if (*tail=='{')
  51. {
  52. tail++;
  53. const char *end = strchr(tail, '}');
  54. if (!end)
  55. throw MakeStringException(0, "Invalid archive-embedded filename - no matching } found");
  56. option.set(tail, end - tail);
  57. tail = end+1;
  58. if (*tail==PATHSEPCHAR)
  59. tail++;
  60. else if (*tail != 0)
  61. throw MakeStringException(0, "Invalid archive-embedded filename - " PATHSEPSTR " expected after }");
  62. }
  63. else
  64. option.clear();
  65. if (tail && *tail)
  66. {
  67. StringBuffer s(tail);
  68. s.replace(PATHSEPCHAR, '/');
  69. relPath.set(s);
  70. }
  71. else
  72. relPath.clear();
  73. }
  74. static StringBuffer & buildArchivedFileName(StringBuffer &fullname, const char *archiveFile, const char *option, const char *relPath)
  75. {
  76. fullname.append(archiveFile);
  77. if (option && *option)
  78. fullname.append(PATHSEPCHAR).append('{').append(option).append('}');
  79. if (relPath && *relPath)
  80. fullname.append(PATHSEPCHAR).append(relPath);
  81. return fullname;
  82. }
  83. IDirectoryIterator *createArchiveDirectoryIterator(const char *gitFileName, const char *mask, bool sub, bool includeDirs);
  84. // Wrapper around libarchive's archive_entry struct to ensure we free them at right time
  85. // Because not clear whether safe to use a struct archive_entry object after the archive has been closed,
  86. // we copy the info we need out of them into something we CAN be sure of the lifespan of
  87. class ArchiveEntry : public CInterface, implements IInterface
  88. {
  89. public:
  90. IMPLEMENT_IINTERFACE;
  91. ArchiveEntry(struct archive_entry *entry)
  92. {
  93. mode = archive_entry_filetype(entry);
  94. filesize = archive_entry_size(entry);
  95. path.set(archive_entry_pathname(entry));
  96. }
  97. bool isDir() const
  98. {
  99. return S_ISDIR(mode);
  100. }
  101. inline offset_t size()
  102. {
  103. return filesize;
  104. }
  105. const char *pathname()
  106. {
  107. return path.get();
  108. }
  109. private:
  110. unsigned mode;
  111. offset_t filesize;
  112. StringAttr path;
  113. };
  114. // IFileIO implementation for reading out of libarchive-supported archives
  115. // Because of the nature of the libarchive this may not be efficient for some archive formats
  116. // Have to read through the entire archive directory to find the bit you want, it seems
  117. // It's possible that we could add some seek support to at least avoid having to do so twice?
  118. class ArchiveFileIO : public CInterface, implements IFileIO
  119. {
  120. public:
  121. IMPLEMENT_IINTERFACE;
  122. ArchiveFileIO(const char *_fullName) : fullName(_fullName)
  123. {
  124. // Sadly it seems we can't use a saved entry to read data from an archive. We have to open a new archive
  125. // object and scan through until we find the matching file, in order to extract it.
  126. StringAttr container, option, relpath;
  127. splitArchivedFileName(_fullName, container, option, relpath);
  128. curPos = 0;
  129. lastPos = 0;
  130. curBuffSize = 0;
  131. curBuff = NULL;
  132. archive = archive_read_new();
  133. archive_read_support_format_all(archive);
  134. archive_read_support_compression_all(archive);
  135. int retcode = archive_read_open_filename(archive, container, 10240);
  136. if (retcode == ARCHIVE_OK)
  137. {
  138. struct archive_entry *entry = archive_entry_new();
  139. while (archive_read_next_header2(archive, entry) == ARCHIVE_OK)
  140. {
  141. const char *filename = archive_entry_pathname(entry);
  142. if (strcmp(filename, relpath.get())==0)
  143. {
  144. fileSize = archive_entry_size(entry);
  145. break;
  146. }
  147. }
  148. archive_entry_free(entry);
  149. }
  150. }
  151. ~ArchiveFileIO()
  152. {
  153. archive_read_finish(archive);
  154. }
  155. virtual size32_t read(offset_t pos, size32_t len, void * _data)
  156. {
  157. // NOTE - we don't support multithreaded access (the sequential-only restriction would make that tricky anyway)
  158. if (pos < lastPos)
  159. throw MakeStringException(0, "Only sequential access to contained file %s supported", fullName.get());
  160. byte *data = (byte *) _data;
  161. size32_t lenRequested = len;
  162. while (len > 0 & pos < fileSize)
  163. {
  164. if (pos >= curPos+curBuffSize)
  165. {
  166. int ret = archive_read_data_block(archive, &curBuff, &curBuffSize, &curPos);
  167. if (ret != ARCHIVE_OK)
  168. {
  169. if (ret == ARCHIVE_EOF)
  170. break; // This shouldn't happen if the quoted fileSize was accurate...
  171. else
  172. throw MakeStringException(0, "Read error reading contained file %s", fullName.get());
  173. }
  174. }
  175. else
  176. {
  177. // Copy as much of the current request as we can fulfil from this block
  178. offset_t buffOffset = (pos - curPos);
  179. size_t copyLen = (curBuffSize - buffOffset) > len ? len : curBuffSize - buffOffset; // careful for overflows, we are mixing 64/32bit values
  180. if (curBuff)
  181. memcpy(data, ((const byte *) curBuff) + buffOffset, copyLen);
  182. else
  183. memset(data, 0, copyLen); // Sparse areas of compressed files may be represented with NULL buffers
  184. data += copyLen;
  185. len -= copyLen;
  186. pos += copyLen;
  187. }
  188. }
  189. lastPos = pos;
  190. return lenRequested - len;
  191. }
  192. virtual offset_t size()
  193. {
  194. return fileSize;
  195. }
  196. virtual void close()
  197. {
  198. }
  199. // Write methods not implemented - this is a read-only file
  200. virtual size32_t write(offset_t pos, size32_t len, const void * data)
  201. {
  202. throwUnexpected();
  203. }
  204. virtual offset_t appendFile(IFile *file,offset_t pos=0,offset_t len=(offset_t)-1)
  205. {
  206. throwUnexpected();
  207. }
  208. virtual void setSize(offset_t size)
  209. {
  210. throwUnexpected();
  211. }
  212. virtual void flush()
  213. {
  214. throwUnexpected();
  215. }
  216. protected:
  217. struct archive *archive;
  218. offset_t fileSize;
  219. #if ARCHIVE_VERSION_NUMBER < 3000000
  220. off_t curPos;
  221. #else
  222. int64_t curPos;
  223. #endif
  224. offset_t lastPos;
  225. size_t curBuffSize;
  226. const void *curBuff;
  227. StringAttr fullName;
  228. };
  229. // IFile implementation for reading out of libarchive-supported archives
  230. // These use the struct_archive_entry objects allocated in the directory iterator
  231. // in the hope they might be useful for directly seeking to the file to be extracted
  232. // at some point.
  233. class ArchiveFile : public CInterface, implements IFile
  234. {
  235. public:
  236. IMPLEMENT_IINTERFACE;
  237. ArchiveFile(const char *_fileName, ArchiveEntry *_entry)
  238. : fullName(_fileName),entry(_entry)
  239. {
  240. }
  241. virtual bool exists()
  242. {
  243. return entry != NULL;
  244. }
  245. virtual bool getTime(CDateTime * createTime, CDateTime * modifiedTime, CDateTime * accessedTime)
  246. {
  247. UNIMPLEMENTED; // MORE - maybe could implement if required
  248. }
  249. virtual fileBool isDirectory()
  250. {
  251. if (!entry)
  252. return notFound;
  253. return entry->isDir() ? foundYes : foundNo;
  254. }
  255. virtual fileBool isFile()
  256. {
  257. if (!entry)
  258. return notFound;
  259. return entry->isDir() ? foundNo : foundYes;
  260. }
  261. virtual fileBool isReadOnly()
  262. {
  263. if (!entry)
  264. return notFound;
  265. return foundYes;
  266. }
  267. virtual IFileIO * open(IFOmode mode)
  268. {
  269. assertex(mode==IFOread && entry != NULL);
  270. return new ArchiveFileIO(fullName.str());
  271. }
  272. virtual IFileAsyncIO * openAsync(IFOmode mode)
  273. {
  274. UNIMPLEMENTED;
  275. }
  276. virtual IFileIO * openShared(IFOmode mode, IFSHmode shmode)
  277. {
  278. assertex(mode==IFOread && entry != NULL);
  279. return new ArchiveFileIO(fullName.str());
  280. }
  281. virtual const char * queryFilename()
  282. {
  283. return fullName.str();
  284. }
  285. virtual offset_t size()
  286. {
  287. if (!entry)
  288. return 0;
  289. return entry->size();
  290. }
  291. // Directory functions
  292. virtual IDirectoryIterator *directoryFiles(const char *mask, bool sub, bool includeDirs)
  293. {
  294. if (isDirectory() != foundYes || (mask && !*mask)) // Empty mask string means matches nothing - NULL means matches everything
  295. return createNullDirectoryIterator();
  296. else
  297. {
  298. StringBuffer dirName(fullName);
  299. dirName.append(PATHSEPCHAR);
  300. return createArchiveDirectoryIterator(dirName, mask, sub, includeDirs);
  301. }
  302. }
  303. virtual bool getInfo(bool &_isdir,offset_t &_size,CDateTime &_modtime)
  304. {
  305. _isdir = isDirectory()==foundYes;
  306. _size = size();
  307. _modtime.clear(); // MORE could probably do better
  308. return true; // MORE should this be false if not existing?
  309. }
  310. // Not going to be implemented - this IFile interface is too big..
  311. virtual bool setTime(const CDateTime * createTime, const CDateTime * modifiedTime, const CDateTime * accessedTime) { UNIMPLEMENTED; }
  312. virtual bool remove() { UNIMPLEMENTED; }
  313. virtual void rename(const char *newTail) { UNIMPLEMENTED; }
  314. virtual void move(const char *newName) { UNIMPLEMENTED; }
  315. virtual void setReadOnly(bool ro) { UNIMPLEMENTED; }
  316. virtual bool setCompression(bool set) { UNIMPLEMENTED; }
  317. virtual offset_t compressedSize() { UNIMPLEMENTED; }
  318. virtual unsigned getCRC() { UNIMPLEMENTED; }
  319. virtual void setCreateFlags(unsigned cflags) { UNIMPLEMENTED; }
  320. virtual void setShareMode(IFSHmode shmode) { UNIMPLEMENTED; }
  321. virtual bool createDirectory() { UNIMPLEMENTED; }
  322. virtual IDirectoryDifferenceIterator *monitorDirectory(
  323. IDirectoryIterator *prev=NULL, // in (NULL means use current as baseline)
  324. const char *mask=NULL,
  325. bool sub=false,
  326. bool includedirs=false,
  327. unsigned checkinterval=60*1000,
  328. unsigned timeout=(unsigned)-1,
  329. Semaphore *abortsem=NULL) { UNIMPLEMENTED; }
  330. virtual void copySection(const RemoteFilename &dest, offset_t toOfs=(offset_t)-1, offset_t fromOfs=0, offset_t size=(offset_t)-1, ICopyFileProgress *progress=NULL) { UNIMPLEMENTED; }
  331. virtual void copyTo(IFile *dest, size32_t buffersize=0x100000, ICopyFileProgress *progress=NULL, bool usetmp=false) { UNIMPLEMENTED; }
  332. virtual IMemoryMappedFile *openMemoryMapped(offset_t ofs=0, memsize_t len=(memsize_t)-1, bool write=false) { UNIMPLEMENTED; }
  333. virtual void treeCopyTo(IFile *dest,IpSubNet &subnet,IpAddress &resfrom,bool usetmp=false) { UNIMPLEMENTED; }
  334. protected:
  335. StringBuffer fullName;
  336. Linked<ArchiveEntry> entry;
  337. };
  338. static IFile *createIFileInArchive(const char *containedFileName)
  339. {
  340. StringBuffer fname(containedFileName);
  341. assertex(fname.length());
  342. removeTrailingPathSepChar(fname);
  343. StringAttr container, option, relpath;
  344. splitArchivedFileName(fname.str(), container, option, relpath);
  345. if (relpath.length())
  346. {
  347. StringBuffer dirPath, dirTail;
  348. dirPath.append(container).append(option);
  349. splitFilename(relpath, &dirPath, &dirPath, &dirTail, &dirTail);
  350. Owned<IDirectoryIterator> dir = createArchiveDirectoryIterator(dirPath.str(), dirTail.str(), false, true);
  351. if (dir->first())
  352. {
  353. Linked<IFile> file = &dir->query();
  354. assertex(!dir->next());
  355. return file.getClear();
  356. }
  357. else
  358. return new ArchiveFile(containedFileName, NULL);
  359. }
  360. else
  361. {
  362. // Create an IFile representing the root of the archive as a directory
  363. struct archive_entry *rootEntry = archive_entry_new();
  364. archive_entry_set_pathname(rootEntry, ".");
  365. archive_entry_set_mode(rootEntry, S_IFDIR);
  366. archive_entry_set_size(rootEntry, 0);
  367. return new ArchiveFile(containedFileName, new ArchiveEntry(rootEntry));
  368. }
  369. }
  370. class ArchiveDirectoryIterator : public CInterface, implements IDirectoryIterator
  371. {
  372. public:
  373. IMPLEMENT_IINTERFACE;
  374. ArchiveDirectoryIterator(const char *_containedFileName, const char *_mask, bool _sub, bool _includeDirs)
  375. : mask(_mask), sub(_sub), includeDirs(_includeDirs)
  376. {
  377. splitArchivedFileName(_containedFileName, container, option, relDir);
  378. curIndex = 0;
  379. }
  380. virtual StringBuffer &getName(StringBuffer &buf)
  381. {
  382. assertex(curFile);
  383. return buf.append(curFile->queryFilename());
  384. }
  385. virtual bool isDir()
  386. {
  387. assertex(curFile);
  388. return curFile->isDirectory();
  389. }
  390. virtual __int64 getFileSize()
  391. {
  392. assertex(curFile);
  393. return curFile->size();
  394. }
  395. virtual bool getModifiedTime(CDateTime &ret)
  396. {
  397. UNIMPLEMENTED;
  398. }
  399. virtual bool first()
  400. {
  401. curFile.clear();
  402. entries.kill();
  403. curIndex = 0;
  404. struct archive *archive = archive_read_new();
  405. archive_read_support_format_all(archive);
  406. archive_read_support_compression_all(archive);
  407. int retcode = archive_read_open_filename(archive, container, 10240);
  408. if (retcode == ARCHIVE_OK)
  409. {
  410. struct archive_entry *entry = archive_entry_new();
  411. while (archive_read_next_header2(archive, entry) == ARCHIVE_OK)
  412. {
  413. unsigned mode = archive_entry_filetype(entry);
  414. bool isDir = S_ISDIR(mode);
  415. if (includeDirs || !isDir)
  416. {
  417. const char *filename = archive_entry_pathname(entry);
  418. if (memcmp(filename, relDir.get(), relDir.length())==0)
  419. {
  420. StringBuffer tail(filename + relDir.length());
  421. if (tail.length())
  422. {
  423. if (tail.charAt(tail.length()-1)=='/' || tail.charAt(tail.length()-1)==PATHSEPCHAR)
  424. tail.remove(tail.length()-1, 1);
  425. }
  426. else
  427. {
  428. assert(isDir);
  429. tail.append(".");
  430. }
  431. // Strip off a trailing /, then check that there is no / in the tail
  432. if (strchr(tail, PATHSEPCHAR) == NULL && (!mask.length() || WildMatch(tail, mask, false)))
  433. {
  434. entries.append(*new ArchiveEntry(entry));
  435. }
  436. }
  437. }
  438. }
  439. archive_entry_free(entry);
  440. }
  441. archive_read_finish(archive);
  442. return next();
  443. }
  444. virtual bool next()
  445. {
  446. if (entries.isItem(curIndex))
  447. {
  448. ArchiveEntry &entry = entries.item(curIndex);
  449. curIndex++;
  450. const char *filename = entry.pathname();
  451. StringBuffer containedFileName;
  452. buildArchivedFileName(containedFileName, container, option, filename);
  453. removeTrailingPathSepChar(containedFileName);
  454. curFile.setown(new ArchiveFile(containedFileName, &entry));
  455. return true;
  456. }
  457. else
  458. {
  459. curFile.clear();
  460. return false;
  461. }
  462. }
  463. virtual bool isValid() { return curFile != NULL; }
  464. virtual IFile & query() { return *curFile; }
  465. protected:
  466. StringAttr container;
  467. StringAttr option;
  468. StringAttr relDir;
  469. StringAttr mask;
  470. Owned<IFile> curFile;
  471. unsigned curIndex;
  472. IArrayOf<ArchiveEntry> entries; // The entries that matched
  473. bool includeDirs;
  474. bool sub;
  475. };
  476. IDirectoryIterator *createArchiveDirectoryIterator(const char *gitFileName, const char *mask, bool sub, bool includeDirs)
  477. {
  478. assertex(sub==false); // I don't know what it means!
  479. return new ArchiveDirectoryIterator(gitFileName, mask, sub, includeDirs);
  480. }
  481. class CArchiveFileHook : public CInterface, implements IContainedFileHook
  482. {
  483. public:
  484. IMPLEMENT_IINTERFACE;
  485. virtual IFile * createIFile(const char *fileName)
  486. {
  487. if (isArchiveFileName(fileName))
  488. return createIFileInArchive(fileName);
  489. else
  490. return NULL;
  491. }
  492. protected:
  493. static bool isArchiveFileName(const char *fileName)
  494. {
  495. if (fileName)
  496. return splitName(fileName) != NULL;
  497. return false;
  498. }
  499. } *archiveFileHook;
  500. extern ARCHIVEFILE_API void installFileHook()
  501. {
  502. SpinBlock b(*lock); // Probably overkill!
  503. if (!archiveFileHook)
  504. {
  505. archiveFileHook = new CArchiveFileHook;
  506. addContainedFileHook(archiveFileHook);
  507. }
  508. }
  509. extern ARCHIVEFILE_API void removeFileHook()
  510. {
  511. if (lock)
  512. {
  513. SpinBlock b(*lock); // Probably overkill!
  514. if (archiveFileHook)
  515. {
  516. removeContainedFileHook(archiveFileHook);
  517. archiveFileHook = NULL;
  518. }
  519. }
  520. }
  521. MODULE_INIT(INIT_PRIORITY_STANDARD)
  522. {
  523. lock = new SpinLock;
  524. signature = new RegExpr(ARCHIVE_SIGNATURE);
  525. archiveFileHook = NULL;
  526. return true;
  527. }
  528. MODULE_EXIT()
  529. {
  530. if (archiveFileHook)
  531. {
  532. removeContainedFileHook(archiveFileHook);
  533. archiveFileHook = NULL;
  534. }
  535. delete signature;
  536. delete lock;
  537. lock = NULL;
  538. ::Release(archiveFileHook);
  539. }