Overview

Packages

  • Util

Classes

  • Horde_Array
  • Horde_Array_Sort_Helper
  • Horde_Domhtml
  • Horde_String
  • Horde_Util
  • Horde_Variables
  • Overview
  • Package
  • Class
  • Tree
  1: <?php
  2: /**
  3:  * Utility class to help in loading DOM data from HTML strings.
  4:  *
  5:  * Copyright 2010-2012 Horde LLC (http://www.horde.org/)
  6:  *
  7:  * @author   Michael Slusarz <slusarz@horde.org>
  8:  * @category Horde
  9:  * @package  Util
 10:  * @license  http://www.horde.org/licenses/lgpl21 LGPL 2.1
 11:  */
 12: class Horde_Domhtml
 13: {
 14:     /**
 15:      * DOM object.
 16:      *
 17:      * @var DOMDocument
 18:      */
 19:     public $dom;
 20: 
 21:     /**
 22:      * Original charset of data.
 23:      *
 24:      * @var string
 25:      */
 26:     protected $_origCharset;
 27: 
 28:     /**
 29:      * Encoding tag added to beginning of output.
 30:      *
 31:      * @var string
 32:      */
 33:     protected $_xmlencoding = '';
 34: 
 35:     /**
 36:      * Constructor.
 37:      *
 38:      * @param string $text     The text of the HTML document.
 39:      * @param string $charset  The charset of the HTML document.
 40:      *
 41:      * @throws Exception
 42:      */
 43:     public function __construct($text, $charset = null)
 44:     {
 45:         if (!extension_loaded('dom')) {
 46:             throw new Exception('DOM extension is not available.');
 47:         }
 48: 
 49:         // Bug #9616: Make sure we have valid HTML input.
 50:         if (!strlen($text)) {
 51:             $text = '<html></html>';
 52:         }
 53: 
 54:         $old_error = libxml_use_internal_errors(true);
 55:         $doc = new DOMDocument();
 56: 
 57:         if (is_null($charset)) {
 58:             /* If no charset given, charset is whatever libxml tells us the
 59:              * encoding should be defaulting to 'iso-8859-1'. */
 60:             $doc->loadHTML($text);
 61:             $this->_origCharset = $doc->encoding
 62:                 ? $doc->encoding
 63:                 : 'iso-8859-1';
 64:         } else {
 65:             /* Convert/try with UTF-8 first. */
 66:             $this->_origCharset = Horde_String::lower($charset);
 67:             $this->_xmlencoding = '<?xml encoding="UTF-8"?>';
 68:             $doc->loadHTML($this->_xmlencoding . Horde_String::convertCharset($text, $charset, 'UTF-8'));
 69: 
 70:             if ($doc->encoding &&
 71:                 (Horde_String::lower($doc->encoding) != 'utf-8')) {
 72:                 /* Convert charset to what the HTML document says it SHOULD
 73:                  * be. */
 74:                 $doc->loadHTML(Horde_String::convertCharset($text, $charset, $doc->encoding));
 75:                 $this->_xmlencoding = '';
 76:             }
 77:         }
 78: 
 79:         if ($old_error) {
 80:             libxml_use_internal_errors(false);
 81:         }
 82: 
 83:         $this->dom = $doc;
 84:     }
 85: 
 86:     /**
 87:      * Returns the HEAD element, or creates one if it doesn't exist.
 88:      *
 89:      * @return DOMElement  HEAD element.
 90:      */
 91:     public function getHead()
 92:     {
 93:         $head = $this->dom->getElementsByTagName('head');
 94:         if ($head->length) {
 95:             return $head->item(0);
 96:         }
 97: 
 98:         $headelt = $this->dom->createElement('head');
 99:         $this->dom->appendChild($headelt);
100: 
101:         return $headelt;
102:     }
103: 
104:     /**
105:      * Returns the full HTML text in the original charset.
106:      *
107:      * @return string  HTML text.
108:      */
109:     public function returnHtml()
110:     {
111:         $text = Horde_String::convertCharset($this->dom->saveHTML(), $this->dom->encoding || $this->_origCharset, $this->_origCharset);
112: 
113:         if (!$this->_xmlencoding ||
114:             (($pos = strpos($text, $this->_xmlencoding)) === false)) {
115:             return $text;
116:         }
117: 
118:         return substr_replace($text, '', $pos, strlen($this->_xmlencoding));
119:     }
120: 
121:     /**
122:      * Returns the body text in the original charset.
123:      *
124:      * @return string  HTML text.
125:      */
126:     public function returnBody()
127:     {
128:         $body = $this->dom->getElementsByTagName('body')->item(0);
129:         $text = '';
130: 
131:         if ($body && $body->hasChildNodes()) {
132:             foreach ($body->childNodes as $child) {
133:                 $text .= $this->dom->saveXML($child);
134:             }
135:         }
136: 
137:         return Horde_String::convertCharset($text, 'UTF-8', $this->_origCharset);
138:     }
139: 
140: }
141: 
API documentation generated by ApiGen