1: <?php
2: /**
3: * Removes some common entities and high-ascii or otherwise nonstandard
4: * characters common in text pasted from Microsoft Word into a browser.
5: *
6: * This function should NOT be used on non-ASCII text; it may and probably
7: * will butcher other character sets indiscriminately. Use it only to clean
8: * US-ASCII (7-bit) text which you suspect (or know) may have invalid or
9: * non-printing characters in it.
10: *
11: * Copyright 2004-2012 Horde LLC (http://www.horde.org/)
12: *
13: * See the enclosed file COPYING for license information (LGPL). If you
14: * did not receive this file, see http://www.horde.org/licenses/lgpl21.
15: *
16: * @author Jan Schneider <jan@horde.org>
17: * @category Horde
18: * @license http://www.horde.org/licenses/lgpl21 LGPL 2.1
19: * @package Text_Filter
20: */
class Horde_Text_Filter_Cleanascii extends Horde_Text_Filter_Base
{
    /**
     * Executes any code necessary before applying the filter patterns.
     *
     * The text is matched against a pattern whose only capture group
     * collects everything in front of the first '#'. If that group took part
     * in the match, the text is reduced to the captured part, and a newline
     * is appended unless the captured part is empty (in the sense of
     * empty()). In every other case the text is returned unchanged.
     *
     * NOTE(review): as written, the pattern begins with an empty alternative
     * ("|"), which always matches with zero length at offset 0, so the
     * capture group never takes part in the match and the text always passes
     * through untouched. The pattern looks damaged - possibly a character in
     * front of the "|" was lost. Confirm the intended expression before
     * relying on the truncation branch.
     *
     * @param string $text  The text before the filtering.
     *
     * @return string  The modified text.
     */
    public function preProcess($text)
    {
        /* preg_match() leaves unmatched trailing groups out of $regs, so
         * $regs[1] has to be checked before it is read. Reading it blindly
         * raised an "undefined offset" notice and replaced the whole text
         * with null. */
        if (preg_match('/|([^#]*)#.*/', $text, $regs) && isset($regs[1])) {
            $text = $regs[1];

            if (!empty($text)) {
                $text .= "\n";
            }
        }

        return $text;
    }

    /**
     * Returns a hash with replace patterns.
     *
     * The hash has two entries:
     *  - 'regexp':  regular expression => replacement pairs.
     *  - 'replace': literal search string => replacement pairs.
     *
     * @return array  Patterns hash.
     */
    public function getPatterns()
    {
        /* Remove control characters (bytes 0x00 through 0x1f). */
        $regexp = array('/[\x00-\x1f]+/' => '');

        /* Literal replacements.
         *
         * NOTE(review): the eighteen entries that use '' as key all share
         * the same array key, so PHP keeps only one '' entry, holding the
         * last value listed ('*'). The keys were presumably non-ASCII
         * characters or entities that got lost; an earlier comment fragment
         * here said that one of them only looks like a single quote. The
         * entries are kept in their original order so the keys can be
         * restored from a pristine copy of this file. */
        $replace = array(
            chr(150) => '-',
            chr(167) => '*',
            '' => '*',
            '' => '...',
            '' => "'",
            '' => "'",
            '' => '"',
            '' => '"',
            '' => '*',
            '' => '-',
            '' => '-',
            '' => '*',
            '' => '.',
            '' => '*',
            '' => '*',
            '' => '-',
            '' => '-',
            '' => '*',
            '' => '*',
            '' => '*',
            '•' => '*',
            '►' => '>',
        );

        return array('regexp' => $regexp, 'replace' => $replace);
    }

}
84: