IndexMetadata.pm 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems(R).
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. package IndexMetadata;
  14. use strict;
  15. use warnings;
  16. use Fcntl qw(SEEK_SET);
  17. use File::stat;
  18. use Math::BigInt;
  19. use Exporter;
  20. our @ISA = qw(Exporter);
  21. # private functions
  22. sub _unpackint($ )
  23. {
  24. my ($data) = @_;
  25. my @bytes = split(//, $data);
  26. my $val = new Math::BigInt('0');
  27. $val = ($val<<8)+ord($_) foreach (@bytes);
  28. return $val;
  29. }
  30. sub _packint($$ )
  31. {
  32. my ($val, $len) = @_;
  33. my @bytes;
  34. for (1..$len)
  35. {
  36. unshift(@bytes, chr($val % 256));
  37. $val = $val>>8;
  38. }
  39. return join('', @bytes);
  40. }
  41. # private methods
  42. sub _readint($$$ )
  43. {
  44. my ($self, $offset, $len) = @_;
  45. seek($self->{fh}, $offset, SEEK_SET);
  46. my $data;
  47. my $got = read($self->{fh}, $data, $len);
  48. die("Corrupt or unreadable index (expected $len-byte integer at offset $offset, read $got)") unless($got == $len);
  49. return _unpackint($data);
  50. }
  51. my $blank = new Math::BigInt('0xFFFFFFFFFFFFFFFF');
  52. sub _walkpages($ )
  53. {
  54. my ($self) = @_;
  55. my $page = $self->_readint(0xF8, 8);
  56. return if($page == $blank);
  57. while($page)
  58. {
  59. push(@{$self->{pages}}, $page);
  60. $page = $self->_readint($page, 8);
  61. }
  62. }
  63. sub _readpage($$ )
  64. {
  65. my ($self, $offset) = @_;
  66. my $len = $self->_readint($offset+0x1A, 2);
  67. my $body;
  68. my $got = read($self->{fh}, $body, $len);
  69. die("Corrupt or unreadable index (page at offset $offset indicated length $len, read $got)") unless($got == $len);
  70. return $body;
  71. }
  72. sub _resetpages($ )
  73. {
  74. my ($self) = @_;
  75. my @oldpages = sort({$b <=> $a} @{$self->{pages}}); # @oldpages is in reverse numerical order
  76. $self->{pages} = [];
  77. return @oldpages;
  78. }
  79. sub _nextpage($$ )
  80. {
  81. my ($self, $oldpages) = @_;
  82. my $nextpage = pop(@$oldpages); # @$oldpages is in reverse numerical order, so use pop to fill from start
  83. unless($nextpage)
  84. {
  85. $nextpage = $self->{size};
  86. $self->{size} += 8192;
  87. }
  88. push(@{$self->{pages}}, $nextpage);
  89. return $nextpage;
  90. }
  91. sub _writepage($$$$$$ )
  92. {
  93. my ($self, $xml, $prevpage, $page, $more, $oldpages) = @_;
  94. my @pages = @{$self->{pages}};
  95. my $nextpage = $more ? $self->_nextpage($oldpages) : 0;
  96. my $len = length($xml);
  97. my $fh = $self->{fh};
  98. seek($fh, $page, SEEK_SET);
  99. print($fh _packint($nextpage, 8));
  100. print($fh _packint($prevpage, 8));
  101. print($fh _packint(3, 10));
  102. print($fh _packint($len, 2));
  103. $xml .= (chr(0) x (8164-$len));
  104. print($fh $xml);
  105. return $nextpage;
  106. }
  107. sub _fixhead($ )
  108. {
  109. my ($self) = @_;
  110. my $fh = $self->{fh};
  111. my $page = @{$self->{pages}} ? $self->{pages}->[0] : 0;
  112. seek($fh, 0xF8, SEEK_SET);
  113. print($fh _packint($page, 8));
  114. }
  115. sub _clearpages($@ )
  116. {
  117. my ($self, $oldpages) = @_;
  118. return unless(@$oldpages);
  119. my $newsize = $self->{size};
  120. foreach my $page (@$oldpages) # @$oldpages is in reverse numerical order, so use foreach to remove unused from end
  121. {
  122. if($page = $newsize - 8192)
  123. {
  124. $newsize = $page;
  125. }
  126. else
  127. {
  128. my $fh = $self->{fh};
  129. seek($fh, $page, SEEK_SET);
  130. print($fh (chr(0) x 8192));
  131. warn("Metadata shortened or removed but unused page at $page is not at end of file, blanking instead");
  132. }
  133. }
  134. if($newsize < $self->{size})
  135. {
  136. truncate($self->{fh}, $newsize) or die("Could not trunctate filehandle to $newsize bytes");
  137. $self->{size} = $newsize;
  138. }
  139. }
  140. # public methods
  141. sub new($$ )
  142. {
  143. my ($class, $fh) = @_;
  144. my $self = {fh => $fh, size => 0, pages => []};
  145. bless($self, $class);
  146. my $stat = stat($fh) or die("Could not stat filehandle");
  147. $self->{size} = $stat->size;
  148. $self->_walkpages();
  149. return $self;
  150. }
  151. sub read($ )
  152. {
  153. my ($self) = @_;
  154. return undef unless(@{$self->{pages}});
  155. my $xml;
  156. $xml .= $self->_readpage($_) foreach (@{$self->{pages}});
  157. return $xml;
  158. }
  159. sub rewrite($$ )
  160. {
  161. my ($self, $xml) = @_;
  162. die("Attempt to write empty metadata (use strip to remove)") unless($xml);
  163. my @oldpages = $self->_resetpages();
  164. my $written = 0;
  165. my $len = length($xml);
  166. my $prevpage = 0;
  167. my $page = $self->_nextpage(\@oldpages);
  168. while($page)
  169. {
  170. my $chunk = substr($xml, $written, 8164);
  171. $written += length($chunk);
  172. my $nextpage = $self->_writepage($chunk, $prevpage, $page, ($written < $len), \@oldpages);
  173. $prevpage = $page;
  174. $page = $nextpage;
  175. }
  176. $self->_fixhead();
  177. $self->_clearpages(\@oldpages);
  178. }
  179. sub strip($ )
  180. {
  181. my ($self) = @_;
  182. my @oldpages = $self->_resetpages();
  183. $self->_fixhead();
  184. $self->_clearpages(\@oldpages);
  185. }
  186. # public functions
  187. sub readfile($ )
  188. {
  189. my ($filename) = @_;
  190. open(FH, '<:bytes', $filename) or die("could not open $filename for reading: $!");
  191. my $meta = IndexMetadata->new(*FH);
  192. my $xml = $meta->read();
  193. close(FH);
  194. return $xml;
  195. }
  196. sub stripfile($ )
  197. {
  198. my ($filename) = @_;
  199. open(FH, '+<:bytes', $filename) or die("could not open $filename for rewriting: $!");
  200. my $meta = IndexMetadata->new(*FH);
  201. $meta->strip();
  202. close(FH);
  203. }
  204. 1;
  205. __END__
  206. =head1 NAME
  207. IndexMetadata - perl module to read, rewrite, or strip the metadata from physical index parts
  208. =head1 SYNOPSIS
  209. To read and print metadata:
  210. open(FH, '<:bytes', 'myindex') or die("could not open myindex for reading: $!");
  211. my $meta = IndexMetadata->new(*FH);
  212. my $xml = $meta->read();
  213. close(FH);
  214. print($xml);
  215. ...or simply:
  216. print(IndexMetadata::readfile('myindex'));
  217. To modify metadata:
  218. open(FH, '+<:bytes', 'myindex') or die("could not open myindex for rewriting: $!");
  219. my $meta = IndexMetadata->new(*FH);
  220. my $xml = $meta->read();
  221. # ...
  222. $meta->rewrite($new_xml);
  223. close(FH);
  224. To strip out metadata:
  225. open(FH, '+<:bytes', 'myindex') or die("could not open myindex for rewriting: $!");
  226. my $meta = IndexMetadata->new(*FH);
  227. $meta->strip();
  228. close(FH);
  229. ...or simply:
  230. IndexMetadata::stripfile('myindex');
  231. =head1 DESCRIPTION
  232. =head2 METHODS
  233. =over
  234. =item C<$meta = IndexMetadata-E<gt>new($fh)>
  235. Returns a new metadata handler object for the index at the given filehandle reference. It is an error if it is not possible to stat and read the filehandle, or if it does not point to a valid index.
  236. =item C<$xml = $meta-E<gt>read()>
  237. Returns the index metadata, as a string. It is expected that this string should parse as well-formed XML with a document root named C<metadata>. Returns C<undef> if the index has no metadata. It is an error the index is corrupt or unreadable.
  238. =item C<$meta-E<gt>rewrite($xml)>
  239. Rewrites the index metadata. It is an error if the metadata is empty, or if the index is unwritable, or untruncatable where required (see below). The metadata is taken as a string. B<IMPORTANT>: It is the caller's responsibility to ensure that this string parses as well-formed XML with a document root named C<metadata>.
  240. =item C<$meta-E<gt>strip()>
  241. Strips all metadata from the index. It is an error if the index is unwritable, or untruncatable where required (see below).
  242. =back
  243. =head2 HELPER FUNCTIONS
  244. =over
  245. =item C<$xml = IndexMetadata::readfile($filename)>
  246. Returns the metadata from the named file.
  247. =item C<IndexMetadata::stripfile($filename)>
  248. Strips the metadata from the named file.
  249. =back
  250. =head2 NOTE ON TRUNCATION
  251. If an index contains metadata which is replaced with a string sufficiently shorter using C<rewrite> (specifically one which divides into less 8164 byte chunks) or removed using C<strip> then 8192 byte pages will be removed from the file. Normally, these unused metadata pages will be at the end of the index, and the filehandle will simply be truncated. If for some reason the pages do not fall at the end, they will be blanked instead, as reordering non-metadata pages is not easily possible: warnings will be issued when this occurs.