archive.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574
  1. /*##############################################################################
  2. Copyright (C) 2011 HPCC Systems.
  3. All rights reserved. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU Affero General Public License as
  5. published by the Free Software Foundation, either version 3 of the
  6. License, or (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU Affero General Public License for more details.
  11. You should have received a copy of the GNU Affero General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. ############################################################################## */
  14. #include "platform.h"
  15. #include "jlib.hpp"
  16. #include "jio.hpp"
  17. #include "jmutex.hpp"
  18. #include "jfile.hpp"
  19. #include "jlog.hpp"
  20. #include "jregexp.hpp"
  21. #include "gitfile.hpp"
  22. #include "archive.hpp"
  23. #ifdef _USE_LIBARCHIVE
  24. #include <sys/stat.h>
  25. #include <archive.h>
  26. #include <archive_entry.h>
  27. /*
  28. * Direct access to files in zip archives (and other libarchive-supported formats), without needing to extract them first
  29. * Installs hooks into createIFile, spotting filenames of the form /my/directory/myfile.zip/{password}/path/within/archive
  30. */
  31. #define ARCHIVE_SIGNATURE "[.]{zip|tar|tar[.]gz|tgz}{$|"PATHSEPSTR"}"
  32. static RegExpr *signature;
  33. static SpinLock *lock;
  34. static const char *splitName(const char *fileName)
  35. {
  36. if (!fileName)
  37. return NULL;
  38. SpinBlock b(*lock);
  39. const char *sig = signature->find(fileName);
  40. if (sig)
  41. return sig+signature->findlen();
  42. else
  43. return NULL;
  44. }
  45. static void splitArchivedFileName(const char *fullName, StringAttr &container, StringAttr &option, StringAttr &relPath)
  46. {
  47. const char *tail = splitName(fullName);
  48. assertex(tail);
  49. size_t containerLen = tail-fullName;
  50. if (fullName[containerLen-1]==PATHSEPCHAR)
  51. containerLen--;
  52. container.set(fullName, containerLen);
  53. if (*tail=='{')
  54. {
  55. tail++;
  56. const char *end = strchr(tail, '}');
  57. if (!end)
  58. throw MakeStringException(0, "Invalid archive-embedded filename - no matching } found");
  59. option.set(tail, end - tail);
  60. tail = end+1;
  61. if (*tail==PATHSEPCHAR)
  62. tail++;
  63. else if (*tail != 0)
  64. throw MakeStringException(0, "Invalid archive-embedded filename - " PATHSEPSTR " expected after }");
  65. }
  66. else
  67. option.clear();
  68. if (tail && *tail)
  69. {
  70. StringBuffer s(tail);
  71. s.replace(PATHSEPCHAR, '/');
  72. relPath.set(s);
  73. }
  74. else
  75. relPath.clear();
  76. }
  77. static StringBuffer & buildArchivedFileName(StringBuffer &fullname, const char *archiveFile, const char *option, const char *relPath)
  78. {
  79. fullname.append(archiveFile);
  80. if (option && *option)
  81. fullname.append(PATHSEPCHAR).append('{').append(option).append('}');
  82. if (relPath && *relPath)
  83. fullname.append(PATHSEPCHAR).append(relPath);
  84. return fullname;
  85. }
  // Forward declaration - the definition follows the ArchiveDirectoryIterator class, but
  // ArchiveFile::directoryFiles and createIFileInArchive need to call it before then.
  // The first parameter is the archive-embedded directory name (despite being called gitFileName).
  IDirectoryIterator *createArchiveDirectoryIterator(const char *gitFileName, const char *mask, bool sub, bool includeDirs);
  87. // Wrapper around libarchive's archive_entry struct to ensure we free them at right time
  88. // Because not clear whether safe to use a struct archive_entry object after the archive has been closed,
  89. // we copy the info we need out of them into something we CAN be sure of the lifespan of
  90. class ArchiveEntry : public CInterface, implements IInterface
  91. {
  92. public:
  93. IMPLEMENT_IINTERFACE;
  94. ArchiveEntry(struct archive_entry *entry)
  95. {
  96. mode = archive_entry_filetype(entry);
  97. filesize = archive_entry_size(entry);
  98. path.set(archive_entry_pathname(entry));
  99. }
  100. bool isDir() const
  101. {
  102. return S_ISDIR(mode);
  103. }
  104. inline offset_t size()
  105. {
  106. return filesize;
  107. }
  108. const char *pathname()
  109. {
  110. return path.get();
  111. }
  112. private:
  113. unsigned mode;
  114. offset_t filesize;
  115. StringAttr path;
  116. };
  117. // IFileIO implementation for reading out of libarchive-supported archives
  118. // Because of the nature of the libarchive this may not be efficient for some archive formats
  119. // Have to read through the entire archive directory to find the bit you want, it seems
  120. // It's possible that we could add some seek support to at least avoid having to do so twice?
  121. class ArchiveFileIO : public CInterface, implements IFileIO
  122. {
  123. public:
  124. IMPLEMENT_IINTERFACE;
  125. ArchiveFileIO(const char *_fullName) : fullName(_fullName)
  126. {
  127. // Sadly it seems we can't use a saved entry to read data from an archive. We have to open a new archive
  128. // object and scan through until we find the matching file, in order to extract it.
  129. StringAttr container, option, relpath;
  130. splitArchivedFileName(_fullName, container, option, relpath);
  131. curPos = 0;
  132. lastPos = 0;
  133. curBuffSize = 0;
  134. curBuff = NULL;
  135. archive = archive_read_new();
  136. archive_read_support_format_all(archive);
  137. archive_read_support_compression_all(archive);
  138. int retcode = archive_read_open_filename(archive, container, 10240);
  139. if (retcode == ARCHIVE_OK)
  140. {
  141. struct archive_entry *entry = archive_entry_new();
  142. while (archive_read_next_header2(archive, entry) == ARCHIVE_OK)
  143. {
  144. const char *filename = archive_entry_pathname(entry);
  145. if (strcmp(filename, relpath.get())==0)
  146. {
  147. fileSize = archive_entry_size(entry);
  148. break;
  149. }
  150. }
  151. archive_entry_free(entry);
  152. }
  153. }
  154. ~ArchiveFileIO()
  155. {
  156. archive_read_finish(archive);
  157. }
  158. virtual size32_t read(offset_t pos, size32_t len, void * _data)
  159. {
  160. // NOTE - we don't support multithreaded access (the sequential-only restriction would make that tricky anyway)
  161. if (pos < lastPos)
  162. throw MakeStringException(0, "Only sequential access to contained file %s supported", fullName.get());
  163. byte *data = (byte *) _data;
  164. lastPos = pos;
  165. size32_t lenRequested = len;
  166. while (len > 0 & pos < fileSize)
  167. {
  168. if (pos >= curPos+curBuffSize)
  169. {
  170. int ret = archive_read_data_block(archive, &curBuff, &curBuffSize, &curPos);
  171. if (ret != ARCHIVE_OK)
  172. {
  173. if (ret == ARCHIVE_EOF)
  174. break; // This shouldn't happen if the quoted fileSize was accurate...
  175. else
  176. throw MakeStringException(0, "Read error reading contained file %s", fullName.get());
  177. }
  178. }
  179. else
  180. {
  181. // Copy as much of the current request as we can fulfil from this block
  182. offset_t buffOffset = (pos - curPos);
  183. size_t copyLen = (curBuffSize - buffOffset) > len ? len : curBuffSize - buffOffset; // careful for overflows, we are mixing 64/32bit values
  184. if (curBuff)
  185. memcpy(data, ((const byte *) curBuff) + buffOffset, copyLen);
  186. else
  187. memset(data, 0, copyLen); // Sparse areas of compressed files may be represented with NULL buffers
  188. data += copyLen;
  189. len -= copyLen;
  190. pos += copyLen;
  191. }
  192. }
  193. return lenRequested - len;
  194. }
  195. virtual offset_t size()
  196. {
  197. return fileSize;
  198. }
  199. virtual void close()
  200. {
  201. }
  202. // Write methods not implemented - this is a read-only file
  203. virtual size32_t write(offset_t pos, size32_t len, const void * data)
  204. {
  205. throwUnexpected();
  206. }
  207. virtual offset_t appendFile(IFile *file,offset_t pos=0,offset_t len=(offset_t)-1)
  208. {
  209. throwUnexpected();
  210. }
  211. virtual void setSize(offset_t size)
  212. {
  213. throwUnexpected();
  214. }
  215. virtual void flush()
  216. {
  217. throwUnexpected();
  218. }
  219. protected:
  220. struct archive *archive;
  221. offset_t fileSize;
  222. #if ARCHIVE_VERSION_NUMBER < 3000000
  223. off_t curPos;
  224. #else
  225. unsigned __int64 curPos;
  226. #endif
  227. offset_t lastPos;
  228. size_t curBuffSize;
  229. const void *curBuff;
  230. StringAttr fullName;
  231. };
  232. // IFile implementation for reading out of libarchive-supported archives
  233. // These use the struct_archive_entry objects allocated in the directory iterator
  234. // in the hope they might be useful for directly seeking to the file to be extracted
  235. // at some point.
  236. class ArchiveFile : public CInterface, implements IFile
  237. {
  238. public:
  239. IMPLEMENT_IINTERFACE;
  240. ArchiveFile(const char *_fileName, ArchiveEntry *_entry)
  241. : fullName(_fileName),entry(_entry)
  242. {
  243. }
  244. virtual bool exists()
  245. {
  246. return entry != NULL;
  247. }
  248. virtual bool getTime(CDateTime * createTime, CDateTime * modifiedTime, CDateTime * accessedTime)
  249. {
  250. UNIMPLEMENTED; // MORE - maybe could implement if required
  251. }
  252. virtual fileBool isDirectory()
  253. {
  254. if (!entry)
  255. return notFound;
  256. return entry->isDir() ? foundYes : foundNo;
  257. }
  258. virtual fileBool isFile()
  259. {
  260. if (!entry)
  261. return notFound;
  262. return entry->isDir() ? foundNo : foundYes;
  263. }
  264. virtual fileBool isReadOnly()
  265. {
  266. if (!entry)
  267. return notFound;
  268. return foundYes;
  269. }
  270. virtual IFileIO * open(IFOmode mode)
  271. {
  272. assertex(mode==IFOread && entry != NULL);
  273. return new ArchiveFileIO(fullName.str());
  274. }
  275. virtual IFileAsyncIO * openAsync(IFOmode mode)
  276. {
  277. UNIMPLEMENTED;
  278. }
  279. virtual IFileIO * openShared(IFOmode mode, IFSHmode shmode)
  280. {
  281. assertex(mode==IFOread && entry != NULL);
  282. return new ArchiveFileIO(fullName.str());
  283. }
  284. virtual const char * queryFilename()
  285. {
  286. return fullName.str();
  287. }
  288. virtual offset_t size()
  289. {
  290. if (!entry)
  291. return 0;
  292. return entry->size();
  293. }
  294. // Directory functions
  295. virtual IDirectoryIterator *directoryFiles(const char *mask, bool sub, bool includeDirs)
  296. {
  297. if (isDirectory() != foundYes || (mask && !*mask)) // Empty mask string means matches nothing - NULL means matches everything
  298. return createNullDirectoryIterator();
  299. else
  300. {
  301. StringBuffer dirName(fullName);
  302. dirName.append(PATHSEPCHAR);
  303. return createArchiveDirectoryIterator(dirName, mask, sub, includeDirs);
  304. }
  305. }
  306. virtual bool getInfo(bool &_isdir,offset_t &_size,CDateTime &_modtime)
  307. {
  308. _isdir = isDirectory()==foundYes;
  309. _size = size();
  310. _modtime.clear(); // MORE could probably do better
  311. return true; // MORE should this be false if not existing?
  312. }
  313. // Not going to be implemented - this IFile interface is too big..
  314. virtual bool setTime(const CDateTime * createTime, const CDateTime * modifiedTime, const CDateTime * accessedTime) { UNIMPLEMENTED; }
  315. virtual bool remove() { UNIMPLEMENTED; }
  316. virtual void rename(const char *newTail) { UNIMPLEMENTED; }
  317. virtual void move(const char *newName) { UNIMPLEMENTED; }
  318. virtual void setReadOnly(bool ro) { UNIMPLEMENTED; }
  319. virtual bool setCompression(bool set) { UNIMPLEMENTED; }
  320. virtual offset_t compressedSize() { UNIMPLEMENTED; }
  321. virtual unsigned getCRC() { UNIMPLEMENTED; }
  322. virtual void setCreateFlags(unsigned cflags) { UNIMPLEMENTED; }
  323. virtual void setShareMode(IFSHmode shmode) { UNIMPLEMENTED; }
  324. virtual bool createDirectory() { UNIMPLEMENTED; }
  325. virtual IDirectoryDifferenceIterator *monitorDirectory(
  326. IDirectoryIterator *prev=NULL, // in (NULL means use current as baseline)
  327. const char *mask=NULL,
  328. bool sub=false,
  329. bool includedirs=false,
  330. unsigned checkinterval=60*1000,
  331. unsigned timeout=(unsigned)-1,
  332. Semaphore *abortsem=NULL) { UNIMPLEMENTED; }
  333. virtual void copySection(const RemoteFilename &dest, offset_t toOfs=(offset_t)-1, offset_t fromOfs=0, offset_t size=(offset_t)-1, ICopyFileProgress *progress=NULL) { UNIMPLEMENTED; }
  334. virtual void copyTo(IFile *dest, size32_t buffersize=0x100000, ICopyFileProgress *progress=NULL, bool usetmp=false) { UNIMPLEMENTED; }
  335. virtual IMemoryMappedFile *openMemoryMapped(offset_t ofs=0, memsize_t len=(memsize_t)-1, bool write=false) { UNIMPLEMENTED; }
  336. virtual void treeCopyTo(IFile *dest,IpSubNet &subnet,IpAddress &resfrom,bool usetmp=false) { UNIMPLEMENTED; }
  337. protected:
  338. StringBuffer fullName;
  339. Linked<ArchiveEntry> entry;
  340. };
  341. extern REMOTE_API IFile *createIFileInArchive(const char *containedFileName)
  342. {
  343. StringBuffer fname(containedFileName);
  344. assertex(fname.length());
  345. removeTrailingPathSepChar(fname);
  346. StringBuffer dirPath, dirTail;
  347. splitFilename(fname.str(), &dirPath, &dirPath, &dirTail, &dirTail);
  348. Owned<IDirectoryIterator> dir = createArchiveDirectoryIterator(dirPath.str(), dirTail.str(), false, true);
  349. if (dir->first())
  350. {
  351. Linked<IFile> file = &dir->query();
  352. assertex(!dir->next());
  353. return file.getClear();
  354. }
  355. else
  356. return new ArchiveFile(containedFileName, NULL);
  357. }
  358. class ArchiveDirectoryIterator : public CInterface, implements IDirectoryIterator
  359. {
  360. public:
  361. IMPLEMENT_IINTERFACE;
  362. ArchiveDirectoryIterator(const char *_containedFileName, const char *_mask, bool _sub, bool _includeDirs)
  363. : mask(_mask), sub(_sub), includeDirs(_includeDirs)
  364. {
  365. splitArchivedFileName(_containedFileName, container, option, relDir);
  366. curIndex = 0;
  367. }
  368. virtual StringBuffer &getName(StringBuffer &buf)
  369. {
  370. assertex(curFile);
  371. return buf.append(curFile->queryFilename());
  372. }
  373. virtual bool isDir()
  374. {
  375. assertex(curFile);
  376. return curFile->isDirectory();
  377. }
  378. virtual __int64 getFileSize()
  379. {
  380. assertex(curFile);
  381. return curFile->size();
  382. }
  383. virtual bool getModifiedTime(CDateTime &ret)
  384. {
  385. UNIMPLEMENTED;
  386. }
  387. virtual bool first()
  388. {
  389. curFile.clear();
  390. entries.kill();
  391. curIndex = 0;
  392. struct archive *archive = archive_read_new();
  393. archive_read_support_format_all(archive);
  394. archive_read_support_compression_all(archive);
  395. int retcode = archive_read_open_filename(archive, container, 10240);
  396. if (retcode == ARCHIVE_OK)
  397. {
  398. struct archive_entry *entry = archive_entry_new();
  399. while (archive_read_next_header2(archive, entry) == ARCHIVE_OK)
  400. {
  401. unsigned mode = archive_entry_filetype(entry);
  402. bool isDir = S_ISDIR(mode);
  403. if (includeDirs || !isDir)
  404. {
  405. const char *filename = archive_entry_pathname(entry);
  406. if (memcmp(filename, relDir.get(), relDir.length())==0)
  407. {
  408. StringBuffer tail(filename + relDir.length());
  409. if (tail.length())
  410. {
  411. if (tail.charAt(tail.length()-1)=='/' || tail.charAt(tail.length()-1)==PATHSEPCHAR)
  412. tail.remove(tail.length()-1, 1);
  413. }
  414. else
  415. {
  416. assert(isDir);
  417. tail.append(".");
  418. }
  419. // Strip off a trailing /, then check that there is no / in the tail
  420. if (strchr(tail, PATHSEPCHAR) == NULL && (!mask.length() || WildMatch(tail, mask, false)))
  421. {
  422. DBGLOG("found file %s %s %s", container.get(), relDir.get(), tail.str());
  423. entries.append(*new ArchiveEntry(entry));
  424. }
  425. }
  426. }
  427. }
  428. archive_entry_free(entry);
  429. }
  430. archive_read_finish(archive);
  431. return next();
  432. }
  433. virtual bool next()
  434. {
  435. if (entries.isItem(curIndex))
  436. {
  437. ArchiveEntry &entry = entries.item(curIndex);
  438. curIndex++;
  439. const char *filename = entry.pathname();
  440. StringBuffer containedFileName;
  441. buildArchivedFileName(containedFileName, container, option, filename);
  442. removeTrailingPathSepChar(containedFileName);
  443. curFile.setown(new ArchiveFile(containedFileName, &entry));
  444. return true;
  445. }
  446. else
  447. {
  448. curFile.clear();
  449. return false;
  450. }
  451. }
  452. virtual bool isValid() { return curFile != NULL; }
  453. virtual IFile & query() { return *curFile; }
  454. protected:
  455. StringAttr container;
  456. StringAttr option;
  457. StringAttr relDir;
  458. StringAttr mask;
  459. Owned<IFile> curFile;
  460. unsigned curIndex;
  461. IArrayOf<ArchiveEntry> entries; // The entries that matched
  462. bool includeDirs;
  463. bool sub;
  464. };
  465. IDirectoryIterator *createArchiveDirectoryIterator(const char *gitFileName, const char *mask, bool sub, bool includeDirs)
  466. {
  467. assertex(sub==false); // I don't know what it means!
  468. return new ArchiveDirectoryIterator(gitFileName, mask, sub, includeDirs);
  469. }
  470. class CArchiveFileHook : public CInterface, implements IContainedFileHook
  471. {
  472. public:
  473. IMPLEMENT_IINTERFACE;
  474. virtual IFile * createIFile(const char *fileName)
  475. {
  476. if (isArchiveFileName(fileName))
  477. return createIFileInArchive(fileName);
  478. else
  479. return NULL;
  480. }
  481. protected:
  482. static bool isArchiveFileName(const char *fileName)
  483. {
  484. if (fileName)
  485. return splitName(fileName) != NULL;
  486. return false;
  487. }
  488. } *archiveFileHook;
  489. extern REMOTE_API void installArchiveFileHook()
  490. {
  491. SpinBlock b(*lock); // Probably overkill!
  492. if (!archiveFileHook)
  493. {
  494. archiveFileHook = new CArchiveFileHook;
  495. addContainedFileHook(archiveFileHook);
  496. }
  497. }
  498. extern REMOTE_API void removeArchiveFileHook()
  499. {
  500. SpinBlock b(*lock); // Probably overkill!
  501. if (archiveFileHook)
  502. {
  503. removeContainedFileHook(archiveFileHook);
  504. archiveFileHook = NULL;
  505. }
  506. }
  // Module start-up: allocate the spin lock and compile the archive signature expression
  // used by splitName, and start with no file hook installed.
  MODULE_INIT(INIT_PRIORITY_REMOTE_RMTFILE)
  {
  lock = new SpinLock;
  signature = new RegExpr(ARCHIVE_SIGNATURE);
  archiveFileHook = NULL;
  return true;
  }
  // Module shut-down: unregister the hook (this needs the lock, so it must come first),
  // then free the signature expression and the lock.
  // NOTE(review): removeArchiveFileHook() sets archiveFileHook to NULL whenever it was set,
  // so the Release below is always passed NULL - confirm who drops the hook's reference.
  MODULE_EXIT()
  {
  removeArchiveFileHook();
  delete signature;
  delete lock;
  ::Release(archiveFileHook);
  }
  521. #else
  // Built without _USE_LIBARCHIVE: there is no hook to install, so this does nothing.
  extern REMOTE_API void installArchiveFileHook()
  {
  }
  // Built without _USE_LIBARCHIVE: no hook is ever installed, so there is nothing to remove.
  extern REMOTE_API void removeArchiveFileHook()
  {
  }
  // Built without _USE_LIBARCHIVE: archive members cannot be accessed, so always throws.
  extern REMOTE_API IFile *createIFileInArchive(const char *containedFileName)
  {
  throw MakeStringException(0, "System was built without archive file support");
  }
  532. #endif