chunk - Break A Large XML File Into Manageable Chunks


/ Published in: PHP
Save to your folder(s)

I’ve recently had to parse some pretty large XML documents, and needed a method to read one element at a time.

Here’s a fairly simple solution in PHP.


Copy this code and paste it in your HTML
  1. <?php
  2. /**
  3.  * Chunk
  4.  *
  5.  * Reads a large file in as chunks for easier parsing.
  6.  *
  7.  * The chunks returned are whole <$this->options['element']/>s found within file.
  8.  *
  9.  * Each call to read() returns the whole element including start and end tags.
  10.  *
  11.  * Tested with a 1.8MB file, extracted 500 elements in 0.11s
  12.  * (with no work done, just extracting the elements)
  13.  *
  14.  * Usage:
  15.  * <code>
  16.  * // initialize the object
  17.  * $file = new Chunk('chunk-test.xml', array('element' => 'Chunk'));
  18.  *
  19.  * // loop through the file until all lines are read
  20.  * while ($xml = $file->read()) {
  21.  * // do whatever you want with the string
  22.  * $o = simplexml_load_string($xml);
  23.  * }
  24.  * </code>
  25.  *
  26.  * @package default
  27.  * @author Dom Hastings
  28.  */
  class Chunk {
      /**
       * options
       *
       * @var array Contains all major options:
       *            'path'      string  Directory in which $file is looked up
       *            'element'   string  Name of the XML element read() returns;
       *                                when empty, read() returns the raw file contents
       *            'chunkSize' integer Number of bytes requested from the file per fread()
       * @access public
       */
      public $options = array(
          'path' => './',
          'element' => '',
          'chunkSize' => 512
      );

      /**
       * file
       *
       * @var string The resolved (realpath) filename being read
       * @access public
       */
      public $file = '';

      /**
       * pointer
       *
       * @var integer Byte offset in the file at which the next element search starts.
       *              After a successful read() it points just past the returned element.
       * @access public
       */
      public $pointer = 0;

      /**
       * handle
       *
       * @var resource The fopen() resource
       * @access private
       */
      private $handle = null;

      /**
       * reading
       *
       * @var boolean Whether a read() call is currently pulling data from the file
       * @access private
       */
      private $reading = false;

      /**
       * readBuffer
       *
       * @var string Bytes read from the file that may still belong to the element being
       *             extracted; keeps tags that straddle two chunks in one piece
       * @access private
       */
      private $readBuffer = '';

      /**
       * __construct
       *
       * Builds the Chunk object and opens the file for reading.
       *
       * Only the basename of $file is used; the directory always comes from
       * $options['path']. A chunkSize below 64 is replaced with 512.
       *
       * @param string $file The filename to work with
       * @param array $options The options with which to parse the file
       * @throws Exception When the file does not exist or cannot be opened
       * @author Dom Hastings
       * @access public
       */
      public function __construct($file, $options = array()) {
          // merge the caller's options over the defaults
          $this->options = array_merge($this->options, (is_array($options) ? $options : array()));

          // make sure the path ends with a /
          if (substr($this->options['path'], -1) != '/') {
              $this->options['path'] .= '/';
          }

          // normalize the filename (drops any directory part)
          $file = basename($file);

          // make sure chunkSize is a sane integer
          $this->options['chunkSize'] = intval($this->options['chunkSize']);

          if ($this->options['chunkSize'] < 64) {
              $this->options['chunkSize'] = 512;
          }

          // realpath() returns false for a missing file, so keep the requested
          // name around to be able to report it in the error message
          $requested = $this->options['path'].$file;
          $resolved = realpath($requested);

          if ($resolved === false || !file_exists($resolved)) {
              throw new Exception('Cannot load file: '.$requested);
          }

          $this->file = $resolved;

          // binary mode: $pointer is a byte offset, so no newline translation may occur
          $this->handle = fopen($this->file, 'rb');

          if (!$this->handle) {
              throw new Exception('Error opening file for reading');
          }
      }

      /**
       * __destruct
       *
       * Cleans up
       *
       * @return void
       * @author Dom Hastings
       * @access public
       */
      public function __destruct() {
          // only close a handle that is actually open
          if (is_resource($this->handle)) {
              fclose($this->handle);
          }
      }

      /**
       * read
       *
       * Reads the next occurrence of the XML element $this->options['element'],
       * starting at $this->pointer.
       *
       * The opening tag may carry attributes and may be self-closing. The element
       * ends at the first matching closing tag, so elements nested inside an element
       * of the same name are not supported. Comments and CDATA sections are not
       * interpreted; text that looks like the tag inside them is treated as a tag.
       *
       * When no element name is configured, the rest of the file is returned as-is
       * from the handle's current position and $this->pointer is left untouched.
       *
       * @return string|false The element including its start and end tags, or false
       *                      when no further complete element exists. In whole-file
       *                      mode a (possibly empty) string is returned.
       * @author Dom Hastings
       * @access public
       */
      public function read() {
          $element = isset($this->options['element']) ? trim((string) $this->options['element']) : '';

          // $options is public, so re-validate the chunk size on every call
          $chunkSize = intval($this->options['chunkSize']);

          if ($chunkSize < 1) {
              $chunkSize = 512;
          }

          if ($element === '') {
              return $this->readRemainder($chunkSize);
          }

          return $this->readElement($element, $chunkSize);
      }

      /**
       * readRemainder
       *
       * Returns everything from the handle's current position up to the end of the file.
       *
       * @param integer $chunkSize Number of bytes to request per fread()
       * @return string The data read; empty when the handle is already at the end
       * @access private
       */
      private function readRemainder($chunkSize) {
          $buffer = '';

          $this->reading = true;

          while ($this->reading) {
              $tmp = fread($this->handle, $chunkSize);

              // a failed or empty read means there is nothing more to get; stopping
              // here avoids spinning forever on a handle that never reports EOF
              if ($tmp === false || $tmp === '') {
                  $this->reading = false;

              } else {
                  $buffer .= $tmp;

                  $this->reading = (!feof($this->handle));
              }
          }

          return $buffer;
      }

      /**
       * readElement
       *
       * Extracts the next complete $element from the file, beginning at $this->pointer.
       *
       * Data is accumulated in $this->readBuffer, whose first byte always sits at the
       * absolute file offset $base. Bytes that cannot belong to the element are dropped
       * from the front of the buffer as soon as that is known, so memory use is bounded
       * by the size of one element plus one chunk.
       *
       * @param string $element The element name to look for
       * @param integer $chunkSize Number of bytes to request per fread()
       * @return string|false The element, or false if no complete element remains
       * @access private
       */
      private function readElement($element, $chunkSize) {
          $openPrefix = '<'.$element;
          $closeTag = '</'.$element.'>';
          $closeLength = strlen($closeTag);

          $this->reading = true;
          $this->readBuffer = '';

          // seek to the position we need in the file
          if (fseek($this->handle, $this->pointer) !== 0) {
              $this->reading = false;

              return false;
          }

          $base = $this->pointer; // absolute file offset of $this->readBuffer[0]
          $opened = false;        // true once the buffer starts with the opening tag
          $bodyFrom = false;      // buffer offset just past the opening tag's '>'
          $closeScan = 0;         // buffer offset from which to look for the closing tag
          $end = false;           // buffer offset just past the complete element

          while (true) {
              // step 1: find the opening tag and drop everything in front of it
              if (!$opened) {
                  list($opened, $keepFrom) = $this->findOpenTag($this->readBuffer, $openPrefix);

                  if ($keepFrom > 0) {
                      $this->readBuffer = (string) substr($this->readBuffer, $keepFrom);
                      $base += $keepFrom;
                  }
              }

              // step 2: find where the opening tag ends
              if ($opened && $bodyFrom === false) {
                  $tagEnd = $this->findTagEnd($this->readBuffer, strlen($openPrefix));

                  if ($tagEnd !== false) {
                      // "<element ... />" is complete on its own
                      if ($this->readBuffer[$tagEnd - 1] === '/') {
                          $end = $tagEnd + 1;

                          break;
                      }

                      $bodyFrom = $tagEnd + 1;
                      $closeScan = $bodyFrom;
                  }
              }

              // step 3: find the closing tag
              if ($bodyFrom !== false) {
                  $closeAt = strpos($this->readBuffer, $closeTag, $closeScan);

                  if ($closeAt !== false) {
                      $end = $closeAt + $closeLength;

                      break;
                  }

                  // next time only rescan the tail that could hold a partial closing tag
                  $closeScan = max($bodyFrom, strlen($this->readBuffer) - $closeLength + 1);
              }

              // step 4: fetch more data, or give up at the end of the file
              if (feof($this->handle)) {
                  break;
              }

              $tmp = fread($this->handle, $chunkSize);

              if ($tmp === false || $tmp === '') {
                  break;
              }

              $this->readBuffer .= $tmp;
          }

          $this->reading = false;

          if ($end === false) {
              // nothing complete was found; remember how far the scan got so the
              // discarded bytes are not searched again on the next call
              $this->pointer = $base;
              $this->readBuffer = '';

              return false;
          }

          $xml = substr($this->readBuffer, 0, $end);

          // the next search starts right behind this element
          $this->pointer = $base + $end;
          $this->readBuffer = '';

          return $xml;
      }

      /**
       * findOpenTag
       *
       * Looks for "<element" followed by whitespace, "/" or ">" so that elements whose
       * name merely starts with the wanted name (e.g. <Chunks> for Chunk) are skipped.
       *
       * @param string $data The buffer to search
       * @param string $openPrefix "<" followed by the element name
       * @return array array(boolean $found, integer $offset). When $found is true,
       *               $offset is the position of the tag's "<". Otherwise $offset is
       *               the first byte that must be kept because it may be the start of
       *               a tag that continues in the next chunk.
       * @access private
       */
      private function findOpenTag($data, $openPrefix) {
          $prefixLength = strlen($openPrefix);
          $dataLength = strlen($data);
          $from = 0;

          while (($at = strpos($data, $openPrefix, $from)) !== false) {
              $next = $at + $prefixLength;

              // the character deciding whether this is our tag is not here yet
              if ($next >= $dataLength) {
                  return array(false, $at);
              }

              if (strpos(" \t\r\n/>", $data[$next]) !== false) {
                  return array(true, $at);
              }

              // a longer element name, keep looking behind it
              $from = $at + 1;
          }

          // keep just enough of the tail to complete a tag split across two chunks
          return array(false, max(0, $dataLength - $prefixLength + 1));
      }

      /**
       * findTagEnd
       *
       * Finds the ">" that terminates a tag, ignoring any ">" inside quoted
       * attribute values.
       *
       * @param string $data The buffer, starting with the tag
       * @param integer $from Offset at which to start scanning (just past the tag name)
       * @return integer|false Offset of the terminating ">", or false if it has not
       *                       been read yet
       * @access private
       */
      private function findTagEnd($data, $from) {
          $length = strlen($data);
          $quote = '';

          for ($i = $from; $i < $length; $i++) {
              $char = $data[$i];

              if ($quote !== '') {
                  // inside an attribute value: only the matching quote ends it
                  if ($char === $quote) {
                      $quote = '';
                  }

              } elseif ($char === '"' || $char === "'") {
                  $quote = $char;

              } elseif ($char === '>') {
                  return $i;
              }
          }

          return false;
      }
  }

URL: http://www.dom111.co.uk/blog/coding/chunk-read-a-large-xml-file-a-chunk-at-a-time/99

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to login to post a comment.