1: <?php
2: /**
3: * The Horde_Mime_Viewer_Html class renders out HTML text with an effort to
4: * remove potentially malicious code.
5: *
6: * Copyright 1999-2012 Horde LLC (http://www.horde.org/)
7: *
8: * See the enclosed file COPYING for license information (GPL). If you
9: * did not receive this file, see http://www.horde.org/licenses/gpl.
10: *
11: * @author Anil Madhavapeddy <anil@recoil.org>
12: * @author Jon Parise <jon@horde.org>
13: * @author Michael Slusarz <slusarz@horde.org>
14: * @category Horde
15: * @license http://www.horde.org/licenses/lgpl21 LGPL 2.1
16: * @package Mime_Viewer
17: */
class Horde_Mime_Viewer_Html extends Horde_Mime_Viewer_Base
{
    /**
     * This driver's display capabilities.
     *
     * @var array
     */
    protected $_capability = array(
        'full' => true,
        'info' => false,
        'inline' => true,
        'raw' => false
    );

    /**
     * The CSS appended to the style attribute of a link that was flagged
     * by the phishing check.
     *
     * @var string
     */
    protected $_phishCss = 'padding: 1px;margin-bottom: 3px;font-size: 90%;border: 1px solid #800;background: #e81222;color: #fff;width: 100%;';

    /**
     * Whether at least one link was flagged as phishing during the last
     * call to _cleanHTML(). Reset to false at the start of every call.
     *
     * @var boolean
     */
    protected $_phishWarn = false;

    /**
     * Temp array for storing data when parsing the HTML document. Populated
     * by _cleanHTML() with the keys:
     *   - base: (string|null) Prefix taken from a <base href> element.
     *   - inline: (boolean) Are we rendering inline?
     *   - phish: (boolean) Is phishing detection active?
     *
     * @var array
     */
    protected $_tmp = array();

    /**
     * Constructor.
     *
     * @param Horde_Mime_Part $part  The object with the data to be rendered.
     * @param array $conf            Configuration:
     * <pre>
     * browser - (Horde_Browser) A browser object. REQUIRED.
     * external_callback - (callback) A callback function that a href URL is
     *                     passed through. The function must take the original
     *                     URL as the first parameter.
     *                     DEFAULT: No callback
     * phishing_check - (boolean) Is phishing detection enabled? Read by
     *                  _cleanHTML().
     * tidy_size_limit - (mixed) Passed as the 'size' parameter of the
     *                   'cleanhtml' text filter; false is used when unset.
     * </pre>
     *
     * @throws InvalidArgumentException
     */
    public function __construct(Horde_Mime_Part $part, array $conf = array())
    {
        $this->_required = array_merge($this->_required, array(
            'browser'
        ));

        parent::__construct($part, $conf);
    }

    /**
     * Return the full rendered version of the Horde_Mime_Part object.
     *
     * @return array  See parent::render().
     */
    protected function _render()
    {
        $html = $this->_cleanHTML($this->_mimepart->getContents(), array('inline' => false));

        return $this->_renderReturn(
            $html,
            $this->_mimepart->getType(true)
        );
    }

    /**
     * Return the rendered inline version of the Horde_Mime_Part object.
     * The cleaned HTML is converted from the part's charset to UTF-8.
     *
     * @return array  See parent::render().
     */
    protected function _renderInline()
    {
        $html = $this->_cleanHTML($this->_mimepart->getContents(), array('inline' => true));

        return $this->_renderReturn(
            Horde_String::convertCharset($html, $this->_mimepart->getCharset(), 'UTF-8'),
            'text/html; charset=UTF-8'
        );
    }

    /**
     * Filters active content, dereferences external links, detects phishing,
     * etc.
     *
     * Side effects: resets $_tmp and $_phishWarn before walking the DOM.
     *
     * @todo Use IP checks from
     * http://lxr.mozilla.org/mailnews/source/mail/base/content/phishingDetector.js.
     *
     * @param string $data    The HTML data.
     * @param array $options  Additional options:
     * <pre>
     * 'charset' - (string) The charset of $data.
     *             DEFAULT: The base part charset.
     * 'inline' - (boolean) Are we viewing inline?
     *            DEFAULT: false
     * 'noprefetch' - (boolean) Disable DNS prefetching?
     *                DEFAULT: false
     * 'phishing' - (boolean) Do phishing highlighting even if not viewing
     *              inline.
     *              DEFAULT: false.
     * </pre>
     *
     * @return string  The cleaned HTML string.
     */
    protected function _cleanHTML($data, $options = array())
    {
        $browser = $this->getConfigParam('browser');
        if (($tidy_limit = $this->getConfigParam('tidy_size_limit')) === null) {
            $tidy_limit = false;
        }
        $charset = isset($options['charset'])
            ? $options['charset']
            : $this->_mimepart->getCharset();
        $inline = !empty($options['inline']);

        /* Inline style attributes are only stripped for inline display in
         * Mozilla 4.x and MSIE. */
        $strip_style_attributes =
            ($inline &&
             (($browser->isBrowser('mozilla') &&
               ($browser->getMajor() == 4)) ||
              $browser->isBrowser('msie')));

        $data = $this->_textFilter($data, array('cleanhtml', 'xss'), array(
            array(
                'charset' => $charset,
                'size' => $tidy_limit
            ),
            array(
                'charset' => $charset,
                'noprefetch' => !empty($options['noprefetch']),
                'return_dom' => true,
                /* $strip_style_attributes can only be true when inline, so
                 * this is equivalent to (inline || strip_style_attributes). */
                'strip_styles' => $inline,
                'strip_style_attributes' => $strip_style_attributes
            )
        ));

        $this->_tmp = array(
            'base' => null,
            'inline' => $inline,
            'phish' => (($inline || !empty($options['phishing'])) && $this->getConfigParam('phishing_check'))
        );
        $this->_phishWarn = false;

        /* _node() walks children from last to first, so a <base> element in
         * <head> would only be seen after the whole <body> was processed.
         * Resolve the base prefix up front so that it is available for
         * every element. */
        if ($inline) {
            $this->_tmp['base'] = $this->_findBaseHref($data->dom);
        }

        $this->_node($data->dom, $data->dom);

        return $data->returnHtml();
    }

    /**
     * Process DOM node: recursively walks all children of $node, fixing up
     * link/base elements, rewriting relative href/src values against a
     * <base> prefix, flagging phishing links, and passing href values
     * through the external callback (if one is configured).
     *
     * @param DOMDocument $doc  Document node.
     * @param DOMNode $node     Node.
     */
    protected function _node($doc, $node)
    {
        if (!$node->hasChildNodes()) {
            return;
        }

        $callback = $this->_getExternalCallback();

        /* Iterate in the reverse direction through the node list. This
         * allows us to alter the original list without breaking things
         * (foreach() w/removeChild() may exit iteration after the removal
         * is completed). */
        for ($i = $node->childNodes->length; $i-- > 0;) {
            $child = $node->childNodes->item($i);

            if ($child instanceof DOMElement) {
                switch (strtolower($child->tagName)) {
                case 'a':
                    /* Strip whitespace from href links. This is bad HTML,
                     * but may prevent viewing of the link. PHP DOM will
                     * already strip this out for us, but if using tidy it
                     * will have URL encoded the spaces. */
                    if ($child->hasAttribute('href')) {
                        $child->setAttribute('href', preg_replace('/^(\%20)+/', '', trim($child->getAttribute('href'))));
                    }
                    break;

                case 'base':
                    /* Deal with <base> tags in the HTML, since they will
                     * screw up our own relative paths. The href is always
                     * removed; the prefix is only recorded here if it has
                     * not already been resolved (e.g. by _cleanHTML()). */
                    if ($this->_tmp['inline'] &&
                        $child->hasAttribute('href')) {
                        if (is_null($this->_tmp['base'])) {
                            $this->_tmp['base'] = $this->_normalizeBase($child->getAttribute('href'));
                        }
                        $child->removeAttribute('href');
                    }
                    break;
                }

                foreach ($child->attributes as $val) {
                    /* Attempt to fix paths that were relying on a <base>
                     * tag. Absolute URLs (anything with a scheme, e.g.
                     * http:, cid:, mailto:, data:) and protocol-relative
                     * URLs must be left alone; prefixing them would
                     * corrupt the reference. */
                    if (!is_null($this->_tmp['base']) &&
                        in_array($val->name, array('href', 'src')) &&
                        $this->_isRelativeUrl($val->value)) {
                        $child->setAttribute($val->name, $this->_tmp['base'] . ltrim($val->value, '/'));
                    }

                    if ($val->name == 'href') {
                        if ($this->_tmp['phish'] &&
                            $this->_phishingCheck($val->value, $child->textContent)) {
                            $this->_phishWarn = true;
                            $child->setAttribute('style', ($child->hasAttribute('style') ? rtrim($child->getAttribute('style'), '; ') . ';' : '') . $this->_phishCss);
                        }

                        if (!is_null($callback)) {
                            /* Try to derefer all external references. */
                            $child->setAttribute('href', call_user_func($callback, $val->value));
                        }
                    }
                }
            }

            $this->_nodeCallback($doc, $child);

            // _nodeCallback() may have removed the node.
            if ($node->childNodes->item($i)) {
                $this->_node($doc, $child);
            }
        }
    }

    /**
     * Process DOM node (callback). Invoked by _node() for every child node
     * before recursing into it. Empty in this class; intended as an
     * extension point for subclasses.
     *
     * @param DOMDocument $doc  Document node.
     * @param DOMNode $node     Node.
     */
    protected function _nodeCallback($doc, $node)
    {
    }

    /**
     * Check for phishing exploits.
     *
     * A link is considered phishing when its visible text looks like a URL
     * (or bare domain name) that points somewhere other than the HREF:
     * either the port differs, or the last two labels of the host names
     * differ.
     *
     * @param string $href  The HREF value.
     * @param string $text  The text value of the link.
     *
     * @return boolean  True if phishing is detected.
     */
    protected function _phishingCheck($href, $text)
    {
        /* For phishing, we are checking whether the displayable text URL is
         * the same as the HREF URL. If we can't parse the text URL, then we
         * can't do phishing checks. Surrounding whitespace (common in
         * textContent of formatted HTML) is not part of the URL. */
        $text_url = @parse_url(trim($text));
        if (!$text_url) {
            return false;
        }

        /* parse_url() returns false on seriously malformed URLs. */
        $href_url = @parse_url($href);
        if (!is_array($href_url)) {
            return false;
        }

        /* Only concern ourselves with HTTP and FTP links. URL schemes are
         * case-insensitive, so normalize before comparing. */
        if (!isset($href_url['scheme']) ||
            !in_array(strtolower($href_url['scheme']), array('ftp', 'http', 'https'))) {
            return false;
        }

        if (!isset($href_url['host'])) {
            $href_url['host'] = '';
        }

        /* Check for case where text is just the domain name. */
        if (!isset($text_url['host'])) {
            if (!isset($text_url['path'])) {
                return false;
            }

            /* Path info may include path, so remove that. */
            if (($pos = strpos($text_url['path'], '/')) !== false) {
                $text_url['path'] = substr($text_url['path'], 0, $pos);
            }

            if (!preg_match("/^[^\.\s\/]+(?:\.[^\.\s]+)+$/", $text_url['path'])) {
                return false;
            }

            $text_url['host'] = $text_url['path'];
        }

        /* If port exists on link, and text link has scheme or port defined,
         * do extra checks:
         * 1. If port exists on text link, and doesn't match, this is
         *    phishing.
         * 2. If port doesn't exist on text link, and port does not match
         *    defaults, this is phishing. */
        if (isset($href_url['port']) &&
            (isset($text_url['scheme']) || isset($text_url['port']))) {
            if (!isset($text_url['port'])) {
                $default_ports = array(
                    'ftp' => 21,
                    'http' => 80,
                    'https' => 443
                );
                $text_scheme = strtolower($text_url['scheme']);
                if (isset($default_ports[$text_scheme])) {
                    $text_url['port'] = $default_ports[$text_scheme];
                }
            }

            /* If the text uses a scheme without a known default port there
             * is nothing to compare against; fall through to the host
             * checks. */
            if (isset($text_url['port']) &&
                ($href_url['port'] != $text_url['port'])) {
                return true;
            }
        }

        if (strcasecmp($href_url['host'], $text_url['host']) === 0) {
            return false;
        }

        /* Don't consider the link a phishing link if the domain is the same
         * on both links (e.g. adtracking.example.com & www.example.com). */
        $host1 = explode('.', $href_url['host']);
        $host2 = explode('.', $text_url['host']);

        return (strcasecmp(implode('.', array_slice($host1, -2)), implode('.', array_slice($host2, -2))) !== 0);
    }

    /**
     * Locate the first <base> element carrying an href attribute and return
     * its normalized value.
     *
     * @param mixed $doc  The document to search (expected: DOMDocument).
     *
     * @return string|null  The base prefix, or null if none was found.
     */
    private function _findBaseHref($doc)
    {
        if (!($doc instanceof DOMDocument)) {
            return null;
        }

        foreach ($doc->getElementsByTagName('base') as $elt) {
            if ($elt->hasAttribute('href')) {
                return $this->_normalizeBase($elt->getAttribute('href'));
            }
        }

        return null;
    }

    /**
     * Ensure a base href ends with a slash so that relative paths can be
     * appended to it directly.
     *
     * @param string $base  The raw href value of a <base> element.
     *
     * @return string  The href with a trailing slash.
     */
    private function _normalizeBase($base)
    {
        if (substr($base, -1) != '/') {
            $base .= '/';
        }

        return $base;
    }

    /**
     * Is the given URL a relative reference, i.e. neither prefixed with a
     * scheme ("http:", "cid:", "mailto:", ...) nor protocol-relative
     * ("//host/path")?
     *
     * @param string $url  The attribute value to test.
     *
     * @return boolean  True if the URL is relative.
     */
    private function _isRelativeUrl($url)
    {
        return !preg_match('/^(?:[a-z][a-z0-9+.\-]*:|\/\/)/i', ltrim($url));
    }

    /**
     * Return the callback that href values are passed through.
     *
     * The callback is documented as a configuration entry, so it is read
     * via getConfigParam() like every other setting of this class. The
     * $_params lookup used previously is kept as a fallback.
     * NOTE(review): $_params is not declared in this class; confirm
     * whether the parent class still provides it.
     *
     * @return callback|null  The callback, or null if none is configured.
     */
    private function _getExternalCallback()
    {
        $callback = $this->getConfigParam('external_callback');
        if (is_null($callback) &&
            isset($this->_params['external_callback'])) {
            $callback = $this->_params['external_callback'];
        }

        return $callback;
    }

}
348: