Overview

Packages

  • Mime
    • Viewer

Classes

  • Horde_Mime_Viewer
  • Horde_Mime_Viewer_Audio
  • Horde_Mime_Viewer_Base
  • Horde_Mime_Viewer_Deb
  • Horde_Mime_Viewer_Default
  • Horde_Mime_Viewer_Enriched
  • Horde_Mime_Viewer_Exception
  • Horde_Mime_Viewer_Html
  • Horde_Mime_Viewer_Images
  • Horde_Mime_Viewer_Msexcel
  • Horde_Mime_Viewer_Mspowerpoint
  • Horde_Mime_Viewer_Msword
  • Horde_Mime_Viewer_Ooo
  • Horde_Mime_Viewer_Pdf
  • Horde_Mime_Viewer_Plain
  • Horde_Mime_Viewer_Rar
  • Horde_Mime_Viewer_Report
  • Horde_Mime_Viewer_Rfc822
  • Horde_Mime_Viewer_Richtext
  • Horde_Mime_Viewer_Rpm
  • Horde_Mime_Viewer_Rtf
  • Horde_Mime_Viewer_Security
  • Horde_Mime_Viewer_Simple
  • Horde_Mime_Viewer_Smil
  • Horde_Mime_Viewer_Syntaxhighlighter
  • Horde_Mime_Viewer_Tgz
  • Horde_Mime_Viewer_Tnef
  • Horde_Mime_Viewer_Translation
  • Horde_Mime_Viewer_Wordperfect
  • Horde_Mime_Viewer_Zip
  • Overview
  • Package
  • Class
  • Tree
  1: <?php
  2: /**
  3:  * The Horde_Mime_Viewer_Html class renders out HTML text with an effort to
  4:  * remove potentially malicious code.
  5:  *
  6:  * Copyright 1999-2012 Horde LLC (http://www.horde.org/)
  7:  *
  8:  * See the enclosed file COPYING for license information (GPL). If you
  9:  * did not receive this file, see http://www.horde.org/licenses/gpl.
 10:  *
 11:  * @author   Anil Madhavapeddy <anil@recoil.org>
 12:  * @author   Jon Parise <jon@horde.org>
 13:  * @author   Michael Slusarz <slusarz@horde.org>
 14:  * @category Horde
 15:  * @license  http://www.horde.org/licenses/lgpl21 LGPL 2.1
 16:  * @package  Mime_Viewer
 17:  */
 18: class Horde_Mime_Viewer_Html extends Horde_Mime_Viewer_Base
 19: {
 20:     /**
 21:      * This driver's display capabilities.
 22:      *
 23:      * @var array
 24:      */
 25:     protected $_capability = array(
 26:         'full' => true,
 27:         'info' => false,
 28:         'inline' => true,
 29:         'raw' => false
 30:     );
 31: 
 32:     /**
 33:      * The CSS used to display the phishing warning.
 34:      *
 35:      * @var string
 36:      */
 37:     protected $_phishCss = 'padding: 1px;margin-bottom: 3px;font-size: 90%;border: 1px solid #800;background: #e81222;color: #fff;width: 100%;';
 38: 
 39:     /**
 40:      * Phishing status of last call to _phishingCheck().
 41:      *
 42:      * @var boolean
 43:      */
 44:     protected $_phishWarn = false;
 45: 
 46:     /**
 47:      * Temp array for storing data when parsing the HTML document.
 48:      *
 49:      * @var array
 50:      */
 51:     protected $_tmp = array();
 52: 
 53:     /**
 54:      * Constructor.
 55:      *
 56:      * @param Horde_Mime_Part $mime_part  The object with the data to be
 57:      *                                    rendered.
 58:      * @param array $conf                 Configuration:
 59:      * <pre>
 60:      * browser - (Horde_Browser) A browser object.
 61:      * external_callback - (callback) A callback function that a href URL is
 62:      *                     passed through. The function must take the original
 63:      *                     URL as the first parameter.
 64:      *                     DEFAULT: No callback
 65:      * </pre>
 66:      *
 67:      * @throws InvalidArgumentException
 68:      */
 69:     public function __construct(Horde_Mime_Part $part, array $conf = array())
 70:     {
 71:         $this->_required = array_merge($this->_required, array(
 72:             'browser'
 73:         ));
 74: 
 75:         parent::__construct($part, $conf);
 76:     }
 77: 
 78:     /**
 79:      * Return the full rendered version of the Horde_Mime_Part object.
 80:      *
 81:      * @return array  See parent::render().
 82:      */
 83:     protected function _render()
 84:     {
 85:         $html = $this->_cleanHTML($this->_mimepart->getContents(), array('inline' => false));
 86: 
 87:         return $this->_renderReturn(
 88:             $html,
 89:             $this->_mimepart->getType(true)
 90:         );
 91:     }
 92: 
 93:     /**
 94:      * Return the rendered inline version of the Horde_Mime_Part object.
 95:      *
 96:      * @return array  See parent::render().
 97:      */
 98:     protected function _renderInline()
 99:     {
100:         $html = $this->_cleanHTML($this->_mimepart->getContents(), array('inline' => true));
101: 
102:         return $this->_renderReturn(
103:             Horde_String::convertCharset($html, $this->_mimepart->getCharset(), 'UTF-8'),
104:             'text/html; charset=UTF-8'
105:         );
106:     }
107: 
108:     /**
109:      * Filters active content, dereferences external links, detects phishing,
110:      * etc.
111:      *
112:      * @todo Use IP checks from
113:      * http://lxr.mozilla.org/mailnews/source/mail/base/content/phishingDetector.js.
114:      *
115:      * @param string $data    The HTML data.
116:      * @param array $options  Additional options:
117:      * <pre>
118:      * 'charset' - (string) The charset of $data.
119:      *             DEFAULT: The base part charset.
120:      * 'inline' - (boolean) Are we viewing inline?
121:      *            DEFAULT: false
122:      * 'noprefetch' - (boolean) Disable DNS prefetching?
123:      *                DEFAULT: false
124:      * 'phishing' - (boolean) Do phishing highlighting even if not viewing
125:      *              inline.
126:      *              DEFAULT: false.
127:      * </pre>
128:      *
129:      * @return string  The cleaned HTML string.
130:      */
131:     protected function _cleanHTML($data, $options = array())
132:     {
133:         $browser = $this->getConfigParam('browser');
134:         if (($tidy_limit = $this->getConfigParam('tidy_size_limit')) === null) {
135:             $tidy_limit = false;
136:         }
137:         $charset = isset($options['charset'])
138:             ? $options['charset']
139:             : $this->_mimepart->getCharset();
140:         $strip_style_attributes =
141:             (!empty($options['inline']) &&
142:              (($browser->isBrowser('mozilla') &&
143:               ($browser->getMajor() == 4)) ||
144:               $browser->isBrowser('msie')));
145: 
146:         $data = $this->_textFilter($data, array('cleanhtml', 'xss'), array(
147:             array(
148:                 'charset' => $charset,
149:                 'size' => $tidy_limit
150:             ),
151:             array(
152:                 'charset' => $charset,
153:                 'noprefetch' => !empty($options['noprefetch']),
154:                 'return_dom' => true,
155:                 'strip_styles' => (!empty($options['inline']) || $strip_style_attributes),
156:                 'strip_style_attributes' => $strip_style_attributes
157:             )
158:         ));
159: 
160:         $this->_tmp = array(
161:             'base' => null,
162:             'inline' => !empty($options['inline']),
163:             'phish' => ((!empty($options['inline']) || !empty($options['phishing'])) && $this->getConfigParam('phishing_check'))
164:         );
165:         $this->_phishWarn = false;
166: 
167:         $this->_node($data->dom, $data->dom);
168: 
169:         return $data->returnHtml();
170:     }
171: 
172:     /**
173:      * Process DOM node.
174:      *
175:      * @param DOMDocument $doc  Document node.
176:      * @param DOMNode $node     Node.
177:      */
178:     protected function _node($doc, $node)
179:     {
180:         if ($node->hasChildNodes()) {
181:             /* Iterate in the reverse direction through the node list. This
182:              * allows us to alter the original list without breaking things
183:              * (foreach() w/removeChild() may exit iteration after the removal
184:              * is completed). */
185:             for ($i = $node->childNodes->length; $i-- > 0;) {
186:                 $child = $node->childNodes->item($i);
187: 
188:                 if ($child instanceof DOMElement) {
189:                     switch (strtolower($child->tagName)) {
190:                     case 'a':
191:                         /* Strip whitespace from href links. This is bad HTML,
192:                          * but may prevent viewing of the link. PHP DOM will
193:                          * already strip this out for us, but if using tidy it
194:                          * will have URL encoded the spaces. */
195:                         if ($child->hasAttribute('href')) {
196:                             $child->setAttribute('href', preg_replace('/^(\%20)+/', '', trim($child->getAttribute('href'))));
197:                         }
198:                         break;
199: 
200:                     case 'base':
201:                         /* Deal with <base> tags in the HTML, since they will
202:                          * screw up our own relative paths. */
203:                         if ($this->_tmp['inline'] &&
204:                             $child->hasAttribute('href')) {
205:                             $base = $child->getAttribute('href');
206:                             if (substr($base, -1) != '/') {
207:                                 $base .= '/';
208:                             }
209: 
210:                             $this->_tmp['base'] = $base;
211:                             $child->removeAttribute('href');
212:                         }
213:                         break;
214:                     }
215: 
216:                     foreach ($child->attributes as $val) {
217:                         /* Attempt to fix paths that were relying on a <base>
218:                          * tag. */
219:                         if (!is_null($this->_tmp['base']) &&
220:                             in_array($val->name, array('href', 'src'))) {
221:                             $child->setAttribute($val->name, $this->_tmp['base'] . ltrim($val->value, '/'));
222:                         }
223: 
224:                         if ($val->name == 'href') {
225:                             if ($this->_tmp['phish'] &&
226:                                 $this->_phishingCheck($val->value, $child->textContent)) {
227:                                 $this->_phishWarn = true;
228:                                 $child->setAttribute('style', ($child->hasAttribute('style') ? rtrim($child->getAttribute('style'), '; ') . ';' : '') . $this->_phishCss);
229:                             }
230: 
231:                             if (isset($this->_params['external_callback'])) {
232:                                 /* Try to derefer all external references. */
233:                                 $child->setAttribute('href', call_user_func($this->_params['external_callback'], $val->value));
234:                             }
235:                         }
236:                     }
237:                 }
238: 
239:                 $this->_nodeCallback($doc, $child);
240: 
241:                 // _nodeCallback() may have removed the node.
242:                 if ($node->childNodes->item($i)) {
243:                     $this->_node($doc, $child);
244:                 }
245:             }
246:         }
247:     }
248: 
249:     /**
250:      * Process DOM node (callback).
251:      *
252:      * @param DOMDocument $doc  Document node.
253:      * @param DOMNode $node     Node.
254:      */
255:     protected function _nodeCallback($doc, $node)
256:     {
257:     }
258: 
259:     /**
260:      * Check for phishing exploits.
261:      *
262:      * @param string $href  The HREF value.
263:      * @param string $text  The text value of the link.
264:      *
265:      * @return boolean  True if phishing is detected.
266:      */
267:     protected function _phishingCheck($href, $text)
268:     {
269:         /* For phishing, we are checking whether the displayable text URL is
270:          * the same as the HREF URL. If we can't parse the text URL, then we
271:          * can't do phishing checks. */
272:         $text_url = @parse_url($text);
273:         if (!$text_url) {
274:             return false;
275:         }
276: 
277:         $href_url = parse_url($href);
278:         if (!isset($href_url['host'])) {
279:             $href_url['host'] = '';
280:         }
281: 
282:         /* Only concern ourselves with HTTP and FTP links. */
283:         if (!isset($href_url['scheme']) ||
284:             !in_array($href_url['scheme'], array('ftp', 'http', 'https'))) {
285:             return false;
286:         }
287: 
288:         /* Check for case where text is just the domain name. */
289:         if (!isset($text_url['host'])) {
290:             if (!isset($text_url['path'])) {
291:                 return false;
292:             }
293: 
294:             /* Path info may include path, so remove that. */
295:             if (($pos = strpos($text_url['path'], '/')) !== false) {
296:                 $text_url['path'] = substr($text_url['path'], 0, $pos);
297:             }
298: 
299:             if (!preg_match("/^[^\.\s\/]+(?:\.[^\.\s]+)+$/", $text_url['path'])) {
300:                 return false;
301:             }
302: 
303:             $text_url['host'] = $text_url['path'];
304:         }
305: 
306:         /* If port exists on link, and text link has scheme or port defined,
307:          * do extra checks:
308:          * 1. If port exists on text link, and doesn't match, this is
309:          * phishing.
310:          * 2. If port doesn't exist on text link, and port does not match
311:          * defaults, this is phishing. */
312:         if (isset($href_url['port']) &&
313:             (isset($text_url['scheme']) || isset($text_url['port']))) {
314:             if (!isset($text_url['port'])) {
315:                 switch ($text_url['scheme']) {
316:                 case 'ftp':
317:                     $text_url['port'] = 25;
318:                     break;
319: 
320:                 case 'http':
321:                     $text_url['port'] = 80;
322:                     break;
323: 
324:                 case 'https':
325:                     $text_url['port'] = 443;
326:                     break;
327:                 }
328:             }
329: 
330:             if ($href_url['port'] != $text_url['port']) {
331:                 return false;
332:             }
333:         }
334: 
335:         if (strcasecmp($href_url['host'], $text_url['host']) === 0) {
336:             return false;
337:         }
338: 
339:         /* Don't consider the link a phishing link if the domain is the same
340:          * on both links (e.g. adtracking.example.com & www.example.com). */
341:         $host1 = explode('.', $href_url['host']);
342:         $host2 = explode('.', $text_url['host']);
343: 
344:         return (strcasecmp(implode('.', array_slice($host1, -2)), implode('.', array_slice($host2, -2))) !== 0);
345:     }
346: 
347: }
348: 
API documentation generated by ApiGen