File Horde/String.php

  1: <?php
  2: /**
  3:  * Provides static methods for charset and locale safe string manipulation.
  4:  *
  5:  * Copyright 2003-2012 Horde LLC (http://www.horde.org/)
  6:  *
  7:  * See the enclosed file COPYING for license information (LGPL). If you
  8:  * did not receive this file, see http://www.horde.org/licenses/lgpl21.
  9:  *
 10:  * @author   Jan Schneider <jan@horde.org>
 11:  * @category Horde
 12:  * @license  http://www.horde.org/licenses/lgpl21 LGPL 2.1
 13:  * @package  Util
 14:  */
 15: class Horde_String
 16: {
    /**
     * Cache of locale-independent lower() results, indexed by the original
     * (unconverted) string.
     *
     * @var array
     */
    static protected $_lowers = array();

    /**
     * Cache of locale-independent upper() results, indexed by the original
     * (unconverted) string.
     *
     * @var array
     */
    static protected $_uppers = array();
 30: 
 31:     /**
 32:      * Converts a string from one charset to another.
 33:      *
 34:      * Uses the iconv or the mbstring extensions.
 35:      * The original string is returned if conversion failed or none
 36:      * of the extensions were available.
 37:      *
 38:      * @param mixed $input    The data to be converted. If $input is an an
 39:      *                        array, the array's values get converted
 40:      *                        recursively.
 41:      * @param string $from    The string's current charset.
 42:      * @param string $to      The charset to convert the string to.
 43:      * @param boolean $force  Force conversion?
 44:      *
 45:      * @return mixed  The converted input data.
 46:      */
 47:     static public function convertCharset($input, $from, $to, $force = false)
 48:     {
 49:         /* Don't bother converting numbers. */
 50:         if (is_numeric($input)) {
 51:             return $input;
 52:         }
 53: 
 54:         /* If the from and to character sets are identical, return now. */
 55:         if (!$force && $from == $to) {
 56:             return $input;
 57:         }
 58:         $from = self::lower($from);
 59:         $to = self::lower($to);
 60:         if (!$force && $from == $to) {
 61:             return $input;
 62:         }
 63: 
 64:         if (is_array($input)) {
 65:             $tmp = array();
 66:             reset($input);
 67:             while (list($key, $val) = each($input)) {
 68:                 $tmp[self::_convertCharset($key, $from, $to)] = self::convertCharset($val, $from, $to, $force);
 69:             }
 70:             return $tmp;
 71:         }
 72: 
 73:         if (is_object($input)) {
 74:             // PEAR_Error/Exception objects are almost guaranteed to contain
 75:             // recursion, which will cause a segfault in PHP. We should never
 76:             // reach this line, but add a check.
 77:             if (($input instanceof Exception) ||
 78:                 ($input instanceof PEAR_Error)) {
 79:                 return '';
 80:             }
 81: 
 82:             $input = Horde_Util::cloneObject($input);
 83:             $vars = get_object_vars($input);
 84:             while (list($key, $val) = each($vars)) {
 85:                 $input->$key = self::convertCharset($val, $from, $to, $force);
 86:             }
 87:             return $input;
 88:         }
 89: 
 90:         if (!is_string($input)) {
 91:             return $input;
 92:         }
 93: 
 94:         return self::_convertCharset($input, $from, $to);
 95:     }
 96: 
 97:     /**
 98:      * Internal function used to do charset conversion.
 99:      *
100:      * @param string $input  See self::convertCharset().
101:      * @param string $from   See self::convertCharset().
102:      * @param string $to     See self::convertCharset().
103:      *
104:      * @return string  The converted string.
105:      */
106:     static protected function _convertCharset($input, $from, $to)
107:     {
108:         /* Use utf8_[en|de]code() if possible and if the string isn't too
109:          * large (less than 16 MB = 16 * 1024 * 1024 = 16777216 bytes) - these
110:          * functions use more memory. */
111:         if (Horde_Util::extensionExists('xml') &&
112:             ((strlen($input) < 16777216) ||
113:              !Horde_Util::extensionExists('iconv') ||
114:              !Horde_Util::extensionExists('mbstring'))) {
115:             if (($to == 'utf-8') &&
116:                 in_array($from, array('iso-8859-1', 'us-ascii', 'utf-8'))) {
117:                 return utf8_encode($input);
118:             }
119: 
120:             if (($from == 'utf-8') &&
121:                 in_array($to, array('iso-8859-1', 'us-ascii', 'utf-8'))) {
122:                 return utf8_decode($input);
123:             }
124:         }
125: 
126:         /* Try UTF7-IMAP conversions. */
127:         if (($from == 'utf7-imap') || ($to == 'utf7-imap')) {
128:             try {
129:                 if ($from == 'utf7-imap') {
130:                     return self::convertCharset(Horde_Imap_Client_Utf7imap::Utf7ImapToUtf8($input), 'UTF-8', $to);
131:                 } else {
132:                     if ($from == 'utf-8') {
133:                         $conv = $input;
134:                     } else {
135:                         $conv = self::convertCharset($input, $from, 'UTF-8');
136:                     }
137:                     return Horde_Imap_Client_Utf7imap::Utf8ToUtf7Imap($conv);
138:                 }
139:             } catch (Horde_Imap_Client_Exception $e) {
140:                 return $input;
141:             }
142:         }
143: 
144:         /* Try iconv with transliteration. */
145:         if (Horde_Util::extensionExists('iconv')) {
146:             unset($php_errormsg);
147:             ini_set('track_errors', 1);
148:             $out = @iconv($from, $to . '//TRANSLIT', $input);
149:             $errmsg = isset($php_errormsg);
150:             ini_restore('track_errors');
151:             if (!$errmsg) {
152:                 return $out;
153:             }
154:         }
155: 
156:         /* Try mbstring. */
157:         if (Horde_Util::extensionExists('mbstring')) {
158:             $out = @mb_convert_encoding($input, $to, self::_mbstringCharset($from));
159:             if (!empty($out)) {
160:                 return $out;
161:             }
162:         }
163: 
164:         return $input;
165:     }
166: 
167:     /**
168:      * Makes a string lowercase.
169:      *
170:      * @param string $string   The string to be converted.
171:      * @param boolean $locale  If true the string will be converted based on
172:      *                         a given charset, locale independent else.
173:      * @param string $charset  If $locale is true, the charset to use when
174:      *                         converting.
175:      *
176:      * @return string  The string with lowercase characters.
177:      */
178:     static public function lower($string, $locale = false, $charset = null)
179:     {
180:         if ($locale) {
181:             if (Horde_Util::extensionExists('mbstring')) {
182:                 if (is_null($charset)) {
183:                     throw new InvalidArgumentException('$charset argument must not be null');
184:                 }
185:                 $ret = @mb_strtolower($string, self::_mbstringCharset($charset));
186:                 if (!empty($ret)) {
187:                     return $ret;
188:                 }
189:             }
190:             return strtolower($string);
191:         }
192: 
193:         if (!isset(self::$_lowers[$string])) {
194:             $language = setlocale(LC_CTYPE, 0);
195:             setlocale(LC_CTYPE, 'C');
196:             self::$_lowers[$string] = strtolower($string);
197:             setlocale(LC_CTYPE, $language);
198:         }
199: 
200:         return self::$_lowers[$string];
201:     }
202: 
203:     /**
204:      * Makes a string uppercase.
205:      *
206:      * @param string $string   The string to be converted.
207:      * @param boolean $locale  If true the string will be converted based on a
208:      *                         given charset, locale independent else.
209:      * @param string $charset  If $locale is true, the charset to use when
210:      *                         converting. If not provided the current charset.
211:      *
212:      * @return string  The string with uppercase characters.
213:      */
214:     static public function upper($string, $locale = false, $charset = null)
215:     {
216:         if ($locale) {
217:             if (Horde_Util::extensionExists('mbstring')) {
218:                 if (is_null($charset)) {
219:                     throw new InvalidArgumentException('$charset argument must not be null');
220:                 }
221:                 $ret = @mb_strtoupper($string, self::_mbstringCharset($charset));
222:                 if (!empty($ret)) {
223:                     return $ret;
224:                 }
225:             }
226:             return strtoupper($string);
227:         }
228: 
229:         if (!isset(self::$_uppers[$string])) {
230:             $language = setlocale(LC_CTYPE, 0);
231:             setlocale(LC_CTYPE, 'C');
232:             self::$_uppers[$string] = strtoupper($string);
233:             setlocale(LC_CTYPE, $language);
234:         }
235: 
236:         return self::$_uppers[$string];
237:     }
238: 
239:     /**
240:      * Returns a string with the first letter capitalized if it is
241:      * alphabetic.
242:      *
243:      * @param string $string   The string to be capitalized.
244:      * @param boolean $locale  If true the string will be converted based on a
245:      *                         given charset, locale independent else.
246:      * @param string $charset  The charset to use, defaults to current charset.
247:      *
248:      * @return string  The capitalized string.
249:      */
250:     static public function ucfirst($string, $locale = false, $charset = null)
251:     {
252:         if ($locale) {
253:             if (is_null($charset)) {
254:                 throw new InvalidArgumentException('$charset argument must not be null');
255:             }
256:             $first = self::substr($string, 0, 1, $charset);
257:             if (self::isAlpha($first, $charset)) {
258:                 $string = self::upper($first, true, $charset) . self::substr($string, 1, null, $charset);
259:             }
260:         } else {
261:             $string = self::upper(substr($string, 0, 1), false) . substr($string, 1);
262:         }
263: 
264:         return $string;
265:     }
266: 
267:     /**
268:      * Returns a string with the first letter of each word capitalized if it is
269:      * alphabetic.
270:      *
271:      * Sentences are splitted into words at whitestrings.
272:      *
273:      * @param string $string   The string to be capitalized.
274:      * @param boolean $locale  If true the string will be converted based on a
275:      *                         given charset, locale independent else.
276:      * @param string $charset  The charset to use, defaults to current charset.
277:      *
278:      * @return string  The capitalized string.
279:      */
280:     static public function ucwords($string, $locale = false, $charset = null)
281:     {
282:         $words = preg_split('/(\s+)/', $string, -1, PREG_SPLIT_DELIM_CAPTURE);
283:         for ($i = 0, $c = count($words); $i < $c; $i += 2) {
284:             $words[$i] = self::ucfirst($words[$i], $locale, $charset);
285:         }
286:         return implode('', $words);
287:     }
288: 
289:     /**
290:      * Returns part of a string.
291:      *
292:      * @param string $string   The string to be converted.
293:      * @param integer $start   The part's start position, zero based.
294:      * @param integer $length  The part's length.
295:      * @param string $charset  The charset to use when calculating the part's
296:      *                         position and length, defaults to current
297:      *                         charset.
298:      *
299:      * @return string  The string's part.
300:      */
301:     static public function substr($string, $start, $length = null,
302:                                   $charset = 'UTF-8')
303:     {
304:         if (is_null($length)) {
305:             $length = self::length($string, $charset) - $start;
306:         }
307: 
308:         if ($length == 0) {
309:             return '';
310:         }
311: 
312:         /* Try mbstring. */
313:         if (Horde_Util::extensionExists('mbstring')) {
314:             $ret = @mb_substr($string, $start, $length, self::_mbstringCharset($charset));
315: 
316:             /* mb_substr() returns empty string on failure. */
317:             if (strlen($ret)) {
318:                 return $ret;
319:             }
320:         }
321: 
322:         /* Try iconv. */
323:         if (Horde_Util::extensionExists('iconv')) {
324:             $ret = @iconv_substr($string, $start, $length, $charset);
325: 
326:             /* iconv_substr() returns false on failure. */
327:             if ($ret !== false) {
328:                 return $ret;
329:             }
330:         }
331: 
332:         return substr($string, $start, $length);
333:     }
334: 
335:     /**
336:      * Returns the character (not byte) length of a string.
337:      *
338:      * @param string $string  The string to return the length of.
339:      * @param string $charset The charset to use when calculating the string's
340:      *                        length.
341:      *
342:      * @return integer  The string's length.
343:      */
344:     static public function length($string, $charset = 'UTF-8')
345:     {
346:         $charset = self::lower($charset);
347: 
348:         if ($charset == 'utf-8' || $charset == 'utf8') {
349:             return strlen(utf8_decode($string));
350:         }
351: 
352:         if (Horde_Util::extensionExists('mbstring')) {
353:             $ret = @mb_strlen($string, self::_mbstringCharset($charset));
354:             if (!empty($ret)) {
355:                 return $ret;
356:             }
357:         }
358: 
359:         return strlen($string);
360:     }
361: 
362:     /**
363:      * Returns the numeric position of the first occurrence of $needle
364:      * in the $haystack string.
365:      *
366:      * @param string $haystack  The string to search through.
367:      * @param string $needle    The string to search for.
368:      * @param integer $offset   Allows to specify which character in haystack
369:      *                          to start searching.
370:      * @param string $charset   The charset to use when searching for the
371:      *                          $needle string.
372:      *
373:      * @return integer  The position of first occurrence.
374:      */
375:     static public function pos($haystack, $needle, $offset = 0,
376:                                $charset = 'UTF-8')
377:     {
378:         if (Horde_Util::extensionExists('mbstring')) {
379:             $track_errors = ini_set('track_errors', 1);
380:             $ret = @mb_strpos($haystack, $needle, $offset, self::_mbstringCharset($charset));
381:             ini_set('track_errors', $track_errors);
382:             if (!isset($php_errormsg)) {
383:                 return $ret;
384:             }
385:         }
386: 
387:         return strpos($haystack, $needle, $offset);
388:     }
389: 
390:     /**
391:      * Returns the numeric position of the last occurrence of $needle
392:      * in the $haystack string.
393:      *
394:      * @param string $haystack  The string to search through.
395:      * @param string $needle    The string to search for.
396:      * @param integer $offset   Allows to specify which character in haystack
397:      *                          to start searching.
398:      * @param string $charset   The charset to use when searching for the
399:      *                          $needle string.
400:      *
401:      * @return integer  The position of first occurrence.
402:      */
403:     static public function rpos($haystack, $needle, $offset = 0,
404:                                 $charset = 'UTF-8')
405:     {
406:         if (Horde_Util::extensionExists('mbstring')) {
407:             $track_errors = ini_set('track_errors', 1);
408:             $ret = @mb_strrpos($haystack, $needle, $offset, self::_mbstringCharset($charset));
409:             ini_set('track_errors', $track_errors);
410:             if (!isset($php_errormsg)) {
411:                 return $ret;
412:             }
413:         }
414: 
415:         return strrpos($haystack, $needle, $offset);
416:     }
417: 
418:     /**
419:      * Returns a string padded to a certain length with another string.
420:      * This method behaves exactly like str_pad() but is multibyte safe.
421:      *
422:      * @param string $input    The string to be padded.
423:      * @param integer $length  The length of the resulting string.
424:      * @param string $pad      The string to pad the input string with. Must
425:      *                         be in the same charset like the input string.
426:      * @param const $type      The padding type. One of STR_PAD_LEFT,
427:      *                         STR_PAD_RIGHT, or STR_PAD_BOTH.
428:      * @param string $charset  The charset of the input and the padding
429:      *                         strings.
430:      *
431:      * @return string  The padded string.
432:      */
433:     static public function pad($input, $length, $pad = ' ',
434:                                $type = STR_PAD_RIGHT, $charset = 'UTF-8')
435:     {
436:         $mb_length = self::length($input, $charset);
437:         $sb_length = strlen($input);
438:         $pad_length = self::length($pad, $charset);
439: 
440:         /* Return if we already have the length. */
441:         if ($mb_length >= $length) {
442:             return $input;
443:         }
444: 
445:         /* Shortcut for single byte strings. */
446:         if ($mb_length == $sb_length && $pad_length == strlen($pad)) {
447:             return str_pad($input, $length, $pad, $type);
448:         }
449: 
450:         switch ($type) {
451:         case STR_PAD_LEFT:
452:             $left = $length - $mb_length;
453:             $output = self::substr(str_repeat($pad, ceil($left / $pad_length)), 0, $left, $charset) . $input;
454:             break;
455: 
456:         case STR_PAD_BOTH:
457:             $left = floor(($length - $mb_length) / 2);
458:             $right = ceil(($length - $mb_length) / 2);
459:             $output = self::substr(str_repeat($pad, ceil($left / $pad_length)), 0, $left, $charset) .
460:                 $input .
461:                 self::substr(str_repeat($pad, ceil($right / $pad_length)), 0, $right, $charset);
462:             break;
463: 
464:         case STR_PAD_RIGHT:
465:             $right = $length - $mb_length;
466:             $output = $input . self::substr(str_repeat($pad, ceil($right / $pad_length)), 0, $right, $charset);
467:             break;
468:         }
469: 
470:         return $output;
471:     }
472: 
    /**
     * Wraps the text of a message.
     *
     * The string is always measured and cut as UTF-8. It is consumed in
     * pieces of at most $width characters; each piece is broken at an
     * existing line break, at a folding delimiter, at whitespace, or (with
     * $cut) exactly at $width, in that order of preference.
     *
     * @param string $string         String containing the text to wrap.
     * @param integer $width         Wrap the string at this number of
     *                               characters.
     * @param string $break          Character(s) to use when breaking lines.
     * @param boolean $cut           Whether to cut inside words if a line
     *                               can't be wrapped.
     * @param boolean $line_folding  Whether to apply line folding rules per
     *                               RFC 822 or similar. The correct break
     *                               characters including leading whitespace
     *                               have to be specified too.
     *
     * @return string  String containing the wrapped text.
     */
    static public function wordwrap($string, $width = 75, $break = "\n",
                                    $cut = false, $line_folding = false)
    {
        $wrapped = '';

        while (self::length($string, 'UTF-8') > $width) {
            // Split off the next candidate line of $width characters; the
            // remainder stays in $string.
            $line = self::substr($string, 0, $width, 'UTF-8');
            $string = self::substr($string, self::length($line, 'UTF-8'), null, 'UTF-8');

            // Make sure we didn't cut a word, unless we want hard breaks
            // anyway. If the remainder contains whitespace, everything up to
            // that whitespace is moved back onto $line.
            if (!$cut && preg_match('/^(.+?)((\s|\r?\n).*)/us', $string, $match)) {
                $line .= $match[1];
                $string = $match[2];
            }

            // Wrap at existing line breaks. The existing break sequence is
            // kept as is ($break is not used here); the text following it is
            // pushed back onto $string.
            if (preg_match('/^(.*?)(\r?\n)(.*)$/su', $line, $match)) {
                $wrapped .= $match[1] . $match[2];
                $string = $match[3] . $string;
                continue;
            }

            // Wrap at the last colon or semicolon followed by a whitespace if
            // doing line folding.
            // NOTE(review): the leading group is non-greedy, so this looks
            // like it matches the first such delimiter rather than the last
            // one - confirm the intent.
            if ($line_folding &&
                preg_match('/^(.*?)(;|:)(\s+.*)$/u', $line, $match)) {
                $wrapped .= $match[1] . $match[2] . $break;
                $string = $match[3] . $string;
                continue;
            }

            // Wrap at the last whitespace of $line. When folding, the text
            // before the whitespace must end in a non-whitespace character.
            $sub = $line_folding
                ? '(.+[^\s])'
                : '(.*)';

            if (preg_match('/^' . $sub . '(\s+)(.*)$/u', $line, $match)) {
                $wrapped .= $match[1] . $break;
                // The whitespace itself is only carried over to the next
                // line when folding; otherwise it is dropped.
                $string = ($line_folding ? $match[2] : '') . $match[3] . $string;
                continue;
            }

            // Hard wrap if necessary.
            if ($cut) {
                $wrapped .= $line . $break;
                continue;
            }

            // No break opportunity found and cutting is not allowed: emit
            // the piece without a break.
            $wrapped .= $line;
        }

        return $wrapped . $string;
    }
543: 
544:     /**
545:      * Wraps the text of a message.
546:      *
547:      * @param string $text        String containing the text to wrap.
548:      * @param integer $length     Wrap $text at this number of characters.
549:      * @param string $break_char  Character(s) to use when breaking lines.
550:      * @param boolean $quote      Ignore lines that are wrapped with the '>'
551:      *                            character (RFC 2646)? If true, we don't
552:      *                            remove any padding whitespace at the end of
553:      *                            the string.
554:      *
555:      * @return string  String containing the wrapped text.
556:      */
557:     static public function wrap($text, $length = 80, $break_char = "\n",
558:                                 $quote = false)
559:     {
560:         $paragraphs = array();
561: 
562:         foreach (preg_split('/\r?\n/', $text) as $input) {
563:             if ($quote && (strpos($input, '>') === 0)) {
564:                 $line = $input;
565:             } else {
566:                 /* We need to handle the Usenet-style signature line
567:                  * separately; since the space after the two dashes is
568:                  * REQUIRED, we don't want to trim the line. */
569:                 if ($input != '-- ') {
570:                     $input = rtrim($input);
571:                 }
572:                 $line = self::wordwrap($input, $length, $break_char);
573:             }
574: 
575:             $paragraphs[] = $line;
576:         }
577: 
578:         return implode($break_char, $paragraphs);
579:     }
580: 
581:     /**
582:      * Return a truncated string, suitable for notifications.
583:      *
584:      * @param string $text     The original string.
585:      * @param integer $length  The maximum length.
586:      *
587:      * @return string  The truncated string, if longer than $length.
588:      */
589:     static public function truncate($text, $length = 100)
590:     {
591:         return (self::length($text) > $length)
592:             ? rtrim(self::substr($text, 0, $length - 3)) . '...'
593:             : $text;
594:     }
595: 
596:     /**
597:      * Return an abbreviated string, with characters in the middle of the
598:      * excessively long string replaced by '...'.
599:      *
600:      * @param string $text     The original string.
601:      * @param integer $length  The length at which to abbreviate.
602:      *
603:      * @return string  The abbreviated string, if longer than $length.
604:      */
605:     static public function abbreviate($text, $length = 20)
606:     {
607:         return (self::length($text) > $length)
608:             ? rtrim(self::substr($text, 0, round(($length - 3) / 2))) . '...' . ltrim(self::substr($text, (($length - 3) / 2) * -1))
609:             : $text;
610:     }
611: 
612:     /**
613:      * Returns the common leading part of two strings.
614:      *
615:      * @param string $str1  A string.
616:      * @param string $str2  Another string.
617:      *
618:      * @return string  The start of $str1 and $str2 that is identical in both.
619:      */
620:     static public function common($str1, $str2)
621:     {
622:         for ($result = '', $i = 0;
623:              isset($str1[$i]) && isset($str2[$i]) && $str1[$i] == $str2[$i];
624:              $i++) {
625:             $result .= $str1[$i];
626:         }
627:         return $result;
628:     }
629: 
630:     /**
631:      * Returns true if the every character in the parameter is an alphabetic
632:      * character.
633:      *
634:      * @param string $string   The string to test.
635:      * @param string $charset  The charset to use when testing the string.
636:      *
637:      * @return boolean  True if the parameter was alphabetic only.
638:      */
639:     static public function isAlpha($string, $charset)
640:     {
641:         if (!Horde_Util::extensionExists('mbstring')) {
642:             return ctype_alpha($string);
643:         }
644: 
645:         $charset = self::_mbstringCharset($charset);
646:         $old_charset = mb_regex_encoding();
647: 
648:         if ($charset != $old_charset) {
649:             @mb_regex_encoding($charset);
650:         }
651:         $alpha = !@mb_ereg_match('[^[:alpha:]]', $string);
652:         if ($charset != $old_charset) {
653:             @mb_regex_encoding($old_charset);
654:         }
655: 
656:         return $alpha;
657:     }
658: 
659:     /**
660:      * Returns true if ever character in the parameter is a lowercase letter in
661:      * the current locale.
662:      *
663:      * @param string $string   The string to test.
664:      * @param string $charset  The charset to use when testing the string.
665:      *
666:      * @return boolean  True if the parameter was lowercase.
667:      */
668:     static public function isLower($string, $charset)
669:     {
670:         return ((self::lower($string, true, $charset) === $string) &&
671:                 self::isAlpha($string, $charset));
672:     }
673: 
674:     /**
675:      * Returns true if every character in the parameter is an uppercase letter
676:      * in the current locale.
677:      *
678:      * @param string $string   The string to test.
679:      * @param string $charset  The charset to use when testing the string.
680:      *
681:      * @return boolean  True if the parameter was uppercase.
682:      */
683:     static public function isUpper($string, $charset)
684:     {
685:         return ((self::upper($string, true, $charset) === $string) &&
686:                 self::isAlpha($string, $charset));
687:     }
688: 
689:     /**
690:      * Performs a multibyte safe regex match search on the text provided.
691:      *
692:      * @param string $text     The text to search.
693:      * @param array $regex     The regular expressions to use, without perl
694:      *                         regex delimiters (e.g. '/' or '|').
695:      * @param string $charset  The character set of the text.
696:      *
697:      * @return array  The matches array from the first regex that matches.
698:      */
699:     static public function regexMatch($text, $regex, $charset = null)
700:     {
701:         if (!empty($charset)) {
702:             $regex = self::convertCharset($regex, $charset, 'utf-8');
703:             $text = self::convertCharset($text, $charset, 'utf-8');
704:         }
705: 
706:         $matches = array();
707:         foreach ($regex as $val) {
708:             if (preg_match('/' . $val . '/u', $text, $matches)) {
709:                 break;
710:             }
711:         }
712: 
713:         if (!empty($charset)) {
714:             $matches = self::convertCharset($matches, 'utf-8', $charset);
715:         }
716: 
717:         return $matches;
718:     }
719: 
720:     /**
721:      * Check to see if a string is valid UTF-8.
722:      *
723:      * @since 1.1.0
724:      *
725:      * @param string $text  The text to check.
726:      *
727:      * @return boolean  True if valid UTF-8.
728:      */
729:     static public function validUtf8($text)
730:     {
731:         /* There is bug in PHP/PCRE with larger strings; stack overflow causes
732:          * PHP segfaults. See:
733:          * https://bugs.php.net/bug.php?id=37793
734:          *
735:          * Thus, break string down into smaller chunks instead.
736:          */
737:         $chunk_size = 4000;
738:         $length = strlen($text);
739: 
740:         while ($length > $chunk_size) {
741:             /* Can't use self::substr() here since the input may not be
742:              * proper UTF-8, which is sort of the whole point of this
743:              * method. */
744:             if (!self::validUtf8(substr($text, 0, $chunk_size))) {
745:                 return false;
746:             }
747: 
748:             $text = substr($text, $chunk_size);
749:             $length -= $chunk_size;
750:         }
751: 
752:         /* Regex from:
753:          * http://stackoverflow.com/questions/1523460/ensuring-valid-utf-8-in-php
754:          */
755:         return preg_match('/^(?:
756:               [\x09\x0A\x0D\x20-\x7E]            # ASCII
757:             | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
758:             | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
759:             | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
760:             | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
761:             | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
762:             | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
763:             | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
764:         )*$/xs', $text);
765:     }
766: 
767:     /**
768:      * Workaround charsets that don't work with mbstring functions.
769:      *
770:      * @param string $charset  The original charset.
771:      *
772:      * @return string  The charset to use with mbstring functions.
773:      */
774:     static protected function _mbstringCharset($charset)
775:     {
776:         /* mbstring functions do not handle the 'ks_c_5601-1987' &
777:          * 'ks_c_5601-1989' charsets. However, these charsets are used, for
778:          * example, by various versions of Outlook to send Korean characters.
779:          * Use UHC (CP949) encoding instead. See, e.g.,
780:          * http://lists.w3.org/Archives/Public/ietf-charsets/2001AprJun/0030.html */
781:         return in_array(self::lower($charset), array('ks_c_5601-1987', 'ks_c_5601-1989'))
782:             ? 'UHC'
783:             : $charset;
784:     }
785: 
786:     /**
787:      * Strip UTF-8 byte order mark (BOM) from string data.
788:      *
789:      * @since 1.4.0
790:      *
791:      * @param string $str  Input string (UTF-8).
792:      *
793:      * @return string  Stripped string (UTF-8).
794:      */
795:     static public function trimUtf8Bom($str)
796:     {
797:         return (substr($str, 0, 3) == pack('CCC', 239, 187, 191))
798:             ? substr($str, 3)
799:             : $str;
800:     }
801: 
802: }
803:
Packages

Classes