Code coverage for /20081101/includes/unicode.inc

Line #Times calledCode
1
<?php
2
// $Id: unicode.inc,v 1.34 2008/11/01 19:51:06 dries Exp $
3
4
/**
 * Status code: the check for PHP Unicode support found a configuration error.
 *
 * _unicode_check() returns this when the mbstring extension is present but
 * function overloading or HTTP input/output conversion is enabled.
 */
72366
define('UNICODE_ERROR', -1);
8
9
/**
 * Status code: standard PHP (emulated) Unicode support is being used.
 *
 * _unicode_check() returns this when the mbstring extension is not available.
 */
122366
define('UNICODE_SINGLEBYTE', 0);
13
14
/**
 * Status code: full Unicode support with the PHP mbstring extension is being
 * used.
 *
 * _unicode_check() returns this when mbstring is available and none of the
 * unsupported mbstring settings is active.
 */
182366
define('UNICODE_MULTIBYTE', 1);
19
20
/**
 * Detects the available Unicode support and records it globally.
 *
 * Runs _unicode_check() and stores the first element of its result (the
 * status constant) in $GLOBALS['multibyte']. The accompanying status message
 * is discarded.
 */
function unicode_check() {
  $result = _unicode_check();
  $GLOBALS['multibyte'] = $result[0];
}
26
27
/**
 * Checks PHP's Unicode support and applies the required mbstring settings.
 *
 * Because Drupal needs to be able to handle text in various encodings,
 * mbstring function overloading is not supported. HTTP input/output
 * conversion must be disabled for similar reasons.
 *
 * As side effects, the LC_CTYPE locale is always set to 'C' and, when mbstring
 * is usable, its internal encoding and language are set to UTF-8.
 *
 * @return
 *   An array with two elements: one of the UNICODE_* status constants, and a
 *   translated message describing the limitation or error (an empty string
 *   when the mbstring extension can be used).
 */
function _unicode_check() {
  // Use get_t() so that translation does not break at install time.
  $t = get_t();
  // Placeholder shared by every message below.
  $doc_link = array('@url' => 'http://www.php.net/mbstring');

  // Set the standard C locale to ensure consistent, ASCII-only string
  // handling.
  setlocale(LC_CTYPE, 'C');

  // Without the mbstring extension only emulated support is available.
  if (!function_exists('mb_strlen')) {
    $notice = $t('Operations on Unicode strings are emulated on a best-effort basis. Install the <a href="@url">PHP mbstring extension</a> for improved Unicode support.', $doc_link);
    return array(UNICODE_SINGLEBYTE, $notice);
  }

  // Inspect the mbstring configuration; the first offending setting wins.
  $problem = NULL;
  if (ini_get('mbstring.func_overload') != 0) {
    $problem = $t('Multibyte string function overloading in PHP is active and must be disabled. Check the php.ini <em>mbstring.func_overload</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', $doc_link);
  }
  elseif (ini_get('mbstring.encoding_translation') != 0) {
    $problem = $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.encoding_translation</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', $doc_link);
  }
  elseif (ini_get('mbstring.http_input') != 'pass') {
    $problem = $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.http_input</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', $doc_link);
  }
  elseif (ini_get('mbstring.http_output') != 'pass') {
    $problem = $t('Multibyte string output conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.http_output</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', $doc_link);
  }
  if ($problem !== NULL) {
    return array(UNICODE_ERROR, $problem);
  }

  // mbstring is usable: set the appropriate configuration.
  mb_internal_encoding('utf-8');
  mb_language('uni');
  return array(UNICODE_MULTIBYTE, '');
}
69
70
/**
 * Reports the Unicode library status as a requirements entry.
 *
 * @return
 *   An array with a single 'unicode' key whose value holds 'title', 'value',
 *   'severity' and, when _unicode_check() supplied a message, 'description'.
 */
function unicode_requirements() {
  // Use get_t() so that translation does not break at install time.
  $t = get_t();

  // Human-readable name of each detection result.
  $labels = array();
  $labels[UNICODE_SINGLEBYTE] = $t('Standard PHP');
  $labels[UNICODE_MULTIBYTE] = $t('PHP Mbstring Extension');
  $labels[UNICODE_ERROR] = $t('Error');

  // Requirement severity of each detection result.
  $levels = array();
  $levels[UNICODE_SINGLEBYTE] = REQUIREMENT_WARNING;
  $levels[UNICODE_MULTIBYTE] = REQUIREMENT_OK;
  $levels[UNICODE_ERROR] = REQUIREMENT_ERROR;

  $check = _unicode_check();
  $library = $check[0];
  $description = $check[1];

  $entry = array(
    'title' => $t('Unicode library'),
    'value' => $labels[$library],
  );
  if ($description) {
    $entry['description'] = $description;
  }
  $entry['severity'] = $levels[$library];

  return array('unicode' => $entry);
}
1010
102
/**
 * Prepare a new XML parser.
 *
 * This is a wrapper around xml_parser_create() which extracts the encoding
 * from the XML data first and sets the output encoding to UTF-8. This function
 * should be used instead of xml_parser_create(), because PHP 4's XML parser
 * doesn't check the input encoding itself. "Starting from PHP 5, the input
 * encoding is automatically detected, so that the encoding parameter
 * specifies only the output encoding."
 *
 * This is also where unsupported encodings will be converted. Callers should
 * take this into account: $data might have been changed after the call (a
 * leading UTF-8 byte order mark is removed, and data in an unsupported
 * encoding is replaced by its UTF-8 conversion with the encoding declaration
 * rewritten to "utf-8").
 *
 * The encoding declaration is recognized with either single or double quotes
 * and with optional whitespace around the equals sign, as the XML
 * specification allows.
 *
 * @param &$data
 *   The XML data which will be parsed later.
 * @return
 *   An XML parser object or FALSE on error.
 */
function drupal_xml_parser_create(&$data) {
  // Default XML encoding is UTF-8.
  $encoding = 'utf-8';
  $bom = FALSE;

  // Check for UTF-8 byte order mark (PHP5's XML parser doesn't handle it).
  if (!strncmp($data, "\xEF\xBB\xBF", 3)) {
    $bom = TRUE;
    // Cast to string: on older PHP versions substr() returns FALSE when the
    // data consists of nothing but the byte order mark.
    $data = (string) substr($data, 3);
  }

  // Check for an encoding declaration in the XML prolog if no BOM was found.
  // Group 1 captures the quote character, group 2 the encoding name.
  if (!$bom && preg_match('/^<\?xml[^>]+encoding\s*=\s*(["\'])(.+?)\1/', $data, $match)) {
    $encoding = $match[2];
  }

  // Unsupported encodings are converted here into UTF-8.
  $php_supported = array('utf-8', 'iso-8859-1', 'us-ascii');
  if (!in_array(strtolower($encoding), $php_supported)) {
    $out = drupal_convert_to_utf8($data, $encoding);
    if ($out !== FALSE) {
      $encoding = 'utf-8';
      // Rewrite the declaration so that it matches the converted data.
      $data = preg_replace('/^(<\?xml[^>]+encoding)\s*=\s*(["\'])(.+?)\2/', '\\1="utf-8"', $out);
    }
    else {
      watchdog('php', 'Could not convert XML encoding %s to UTF-8.', array('%s' => $encoding), WATCHDOG_WARNING);
      return FALSE;
    }
  }

  $xml_parser = xml_parser_create($encoding);
  xml_parser_set_option($xml_parser, XML_OPTION_TARGET_ENCODING, 'utf-8');
  return $xml_parser;
}
1540
155
/**
 * Convert data to UTF-8.
 *
 * The first available of iconv, mbstring and GNU recode is used, in that
 * order. Errors raised by the conversion function itself are suppressed.
 *
 * @param $data
 *   The data to be converted.
 * @param $encoding
 *   The encoding that the data is in.
 * @return
 *   Whatever the conversion function returned (the converted data, or its
 *   failure value), or FALSE when no conversion extension is installed.
 */
function drupal_convert_to_utf8($data, $encoding) {
  if (function_exists('iconv')) {
    return @iconv($encoding, 'utf-8', $data);
  }
  if (function_exists('mb_convert_encoding')) {
    return @mb_convert_encoding($data, 'utf-8', $encoding);
  }
  if (function_exists('recode_string')) {
    return @recode_string($encoding . '..utf-8', $data);
  }

  // No conversion library is available at all.
  watchdog('php', 'Unsupported encoding %s. Please install iconv, GNU recode or mbstring for PHP.', array('%s' => $encoding), WATCHDOG_ERROR);
  return FALSE;
}
1840
1850
/**
 * Truncate a UTF-8-encoded string safely to a number of bytes.
 *
 * If the cut position falls on a UTF-8 continuation byte (0x80-0xBF), the
 * position is moved backwards past the continuation bytes and the byte that
 * precedes them, so that no partial byte sequence is returned.
 *
 * Use this function whenever you want to chop off a string at an unsure
 * location. If you are sure that you are splitting on a character boundary
 * (e.g. after using strpos() or similar), plain substr() is sufficient.
 *
 * @param $string
 *   The string to truncate.
 * @param $len
 *   An upper limit on the returned string length, in bytes.
 * @return
 *   The truncated string.
 */
function drupal_truncate_bytes($string, $len) {
  // Already short enough: nothing to cut.
  if (strlen($string) <= $len) {
    return $string;
  }

  $first_dropped = ord($string[$len]);
  if ($first_dropped >= 0x80 && $first_dropped < 0xC0) {
    // The first dropped byte continues a sequence; step back until the byte
    // at the cut position is no longer a continuation byte.
    do {
      $len--;
    } while ($len >= 0 && (ord($string[$len]) & 0xC0) == 0x80);
  }

  return substr($string, 0, $len);
}
2130
2140
/**
 * Truncate a UTF-8-encoded string safely to a number of characters.
 *
 * When $dots is set and $len is too small to hold the full ' ...' suffix, the
 * suffix itself is shortened so that the result never exceeds $len
 * characters. A $len of zero or less yields an empty string.
 *
 * @param $string
 *   The string to truncate.
 * @param $len
 *   An upper limit on the returned string length.
 * @param $wordsafe
 *   Flag to truncate at last space within the upper limit. Defaults to FALSE.
 * @param $dots
 *   Flag to add trailing dots. Defaults to FALSE.
 * @return
 *   The truncated string.
 */
function truncate_utf8($string, $len, $wordsafe = FALSE, $dots = FALSE) {
  if (drupal_strlen($string) <= $len) {
    return $string;
  }

  // Reserve room for the trailing dots. The suffix is plain ASCII, so byte
  // functions are safe on it. It is clipped to the limit so that a small
  // $len can never turn into a negative length below.
  $suffix = '';
  if ($dots) {
    $suffix = (string) substr(' ...', 0, max(0, $len));
    $len -= strlen($suffix);
  }

  if ($len <= 0) {
    // No room left for any part of the original text.
    $string = '';
  }
  elseif ($wordsafe) {
    // Keep one extra character so that a space directly after the limit
    // still counts as a word boundary.
    $candidate = drupal_substr($string, 0, $len + 1);
    // strrpos() returns FALSE (no space) or 0 (leading space); neither is a
    // usable boundary.
    $last_space = strrpos($candidate, ' ');
    if ($last_space) {
      $string = substr($candidate, 0, $last_space);
    }
    else {
      $string = drupal_substr($candidate, 0, $len);
    }
  }
  else {
    $string = drupal_substr($string, 0, $len);
  }

  return $string . $suffix;
}
25715
2588
/**
 * Encodes MIME/HTTP header values that contain non-ASCII, UTF-8 encoded
 * characters.
 *
 * For example, mime_header_encode('tést.txt') returns
 * "=?UTF-8?B?dMOpc3QudHh0?=".
 *
 * See http://www.rfc-editor.org/rfc/rfc2047.txt for more information.
 *
 * Notes:
 * - Only strings that contain characters outside the printable ASCII range
 *   (0x20-0x7E) are encoded; other strings are returned unchanged.
 * - We progressively cut off a chunk with drupal_truncate_bytes(). This is to
 *   ensure each chunk starts and ends on a character boundary.
 * - If the input is malformed so that no character boundary can be found
 *   within a chunk, the chunk is cut at the raw byte limit instead. This
 *   guarantees that the loop always makes progress.
 * - Using \n as the chunk separator may cause problems on some systems and
 *   may have to be changed to \r\n or \r.
 *
 * @param $string
 *   The header value to encode.
 * @return
 *   The encoded header value, or the unchanged input when no encoding is
 *   needed.
 */
function mime_header_encode($string) {
  if (preg_match('/[^\x20-\x7E]/', $string)) {
    $chunk_size = 47; // floor((75 - strlen("=?UTF-8?B??=")) * 0.75);
    $len = strlen($string);
    $output = '';
    while ($len > 0) {
      $chunk = drupal_truncate_bytes($string, $chunk_size);
      $c = strlen($chunk);
      if ($c == 0 || $c > $chunk_size) {
        // Malformed UTF-8 (e.g. a long run of continuation bytes): an empty
        // chunk would loop forever and an oversized one would exceed the line
        // limit, so fall back to a plain byte cut.
        $chunk = substr($string, 0, $chunk_size);
        $c = strlen($chunk);
      }
      $output .= ' =?UTF-8?B?' . base64_encode($chunk) . "?=\n";
      $string = substr($string, $c);
      $len -= $c;
    }
    return trim($output);
  }
  return $string;
}
2890
2900
/**
 * Complement to mime_header_encode().
 *
 * Replaces every encoded word of the form "=?charset?Q|B?data?=" in $header
 * with the result of _mime_header_decode().
 *
 * @param $header
 *   The header value to decode.
 * @return
 *   The header with its encoded words decoded.
 */
function mime_header_decode($header) {
  // First step: encoded chunks followed by other encoded chunks. The
  // whitespace between them is part of the match and is therefore dropped;
  // the lookahead leaves the following chunk in place for the second step.
  $chunk_before_chunk = '/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=\s+(?==\?)/';
  // Second step: remaining chunks (do not collapse whitespace).
  $any_chunk = '/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=/';

  $collapsed = preg_replace_callback($chunk_before_chunk, '_mime_header_decode', $header);
  return preg_replace_callback($any_chunk, '_mime_header_decode', $collapsed);
}
2992366
300
/**
 * Helper function to mime_header_decode().
 *
 * Decodes a single encoded word and converts it to UTF-8 when it is declared
 * in another character set.
 *
 * @param $matches
 *   The regular expression match: index 1 is the character set name, index 2
 *   the escaping method ('Q' or 'B') and index 3 the encoded data.
 * @return
 *   The decoded data, or whatever drupal_convert_to_utf8() returns when a
 *   character set conversion was needed.
 */
function _mime_header_decode($matches) {
  if ($matches[2] == 'B') {
    $data = base64_decode($matches[3]);
  }
  else {
    // In the "Q" encoding an underscore stands for a space, while a literal
    // underscore is transmitted as "=5F". The underscores therefore have to
    // be translated before the hexadecimal escapes are decoded; doing it
    // afterwards would also turn decoded "=5F" sequences into spaces.
    $data = quoted_printable_decode(str_replace('_', ' ', $matches[3]));
  }
  if (strtolower($matches[1]) != 'utf-8') {
    $data = drupal_convert_to_utf8($data, $matches[1]);
  }
  return $data;
}
3140
3150
/**
 * Decode all HTML entities (including numerical ones) to regular UTF-8 bytes.
 *
 * Double-escaped entities will only be decoded once ("&amp;lt;" becomes
 * "&lt;", not "<").
 *
 * @param $text
 *   The text to decode entities in.
 * @param $exclude
 *   An array of characters which should not be decoded. For example,
 *   array('<', '&', '"'). This affects both named and numerical entities.
 * @return
 *   The text with its entities decoded.
 */
function decode_entities($text, $exclude = array()) {
  // Named entities are kept in a table that is built once per request.
  static $table;
  if (!isset($table)) {
    // Get all named HTML entities, keyed by entity with the character as
    // value.
    $by_entity = array_flip(get_html_translation_table(HTML_ENTITIES));
    // The original code assumes PHP returns ISO-8859-1 data here and
    // re-encodes every character as UTF-8.
    $table = array_map('utf8_encode', $by_entity);
    // Add apostrophe (XML).
    $table['&apos;'] = "'";
  }
  // Remove the excluded characters from the named-entity table. The variable
  // names $newtable and $exclude are referenced by the evaluated replacement
  // string below and must not be renamed.
  $newtable = array_diff($table, $exclude);

  // Use a regexp to select all entities in one pass, to avoid decoding
  // double-escaped entities twice.
  // NOTE(review): the "e" modifier evaluates the replacement as PHP code. It
  // is deprecated as of PHP 5.5 and removed in PHP 7; this call needs to move
  // to preg_replace_callback() before running on such versions.
  $pattern = '/&(#x?)?([A-Za-z0-9]+);/e';
  $replacement = '_decode_entities("$1", "$2", "$0", $newtable, $exclude)';
  return preg_replace($pattern, $replacement, $text);
}
34120
34220
/**
 * Helper function for decode_entities().
 *
 * Numerical entities that do not denote a valid Unicode character are left
 * untouched: malformed numbers, code point zero, the surrogate range
 * U+D800-U+DFFF and anything above U+10FFFF.
 *
 * @param $prefix
 *   The entity prefix: empty for a named entity, '#x' for a hexadecimal and
 *   '#' for a decimal numerical entity.
 * @param $codepoint
 *   The entity name, or the digits of a numerical entity.
 * @param $original
 *   The complete entity as found in the text, e.g. "&amp;".
 * @param &$table
 *   Lookup table mapping named entities to their UTF-8 characters.
 * @param &$exclude
 *   An array of characters which should not be decoded.
 * @return
 *   The decoded UTF-8 character, or $original when the entity is unknown,
 *   invalid or excluded.
 */
function _decode_entities($prefix, $codepoint, $original, &$table, &$exclude) {
  // Named entity: the table is keyed by the complete entity text.
  if (!$prefix) {
    return isset($table[$original]) ? $table[$original] : $original;
  }

  // Numerical entity: accept only well-formed digits. Leading zeros are
  // skipped, and the digit count is capped so that the value cannot overflow.
  if ($prefix == '#x') {
    // Hexadecimal numerical entity.
    $valid = preg_match('/^0*([0-9A-Fa-f]{1,6})$/', $codepoint, $digits);
    $codepoint = $valid ? hexdec($digits[1]) : 0;
  }
  else {
    // Decimal numerical entity.
    $valid = preg_match('/^0*([0-9]{1,7})$/', $codepoint, $digits);
    $codepoint = $valid ? (int) $digits[1] : 0;
  }

  // Reject values that cannot be represented as a valid UTF-8 character.
  $is_surrogate = ($codepoint >= 0xD800 && $codepoint <= 0xDFFF);
  if ($codepoint < 1 || $codepoint > 0x10FFFF || $is_surrogate) {
    return $original;
  }

  // Encode codepoint as UTF-8 bytes.
  if ($codepoint < 0x80) {
    $str = chr($codepoint);
  }
  elseif ($codepoint < 0x800) {
    $str = chr(0xC0 | ($codepoint >> 6))
         . chr(0x80 | ($codepoint & 0x3F));
  }
  elseif ($codepoint < 0x10000) {
    $str = chr(0xE0 | ( $codepoint >> 12))
         . chr(0x80 | (($codepoint >> 6) & 0x3F))
         . chr(0x80 | ( $codepoint       & 0x3F));
  }
  else {
    $str = chr(0xF0 | ( $codepoint >> 18))
         . chr(0x80 | (($codepoint >> 12) & 0x3F))
         . chr(0x80 | (($codepoint >> 6)  & 0x3F))
         . chr(0x80 | ( $codepoint        & 0x3F));
  }

  // Check for excluded characters. The comparison is strict so that numeric
  // looking characters are not matched loosely (e.g. "1" against "01").
  return in_array($str, $exclude, TRUE) ? $original : $str;
}
3900
3910
/**
 * Count the number of characters in a UTF-8 string.
 *
 * The result is less than or equal to the byte count.
 *
 * @param $text
 *   The string to measure.
 * @return
 *   The number of characters: mb_strlen() when the global $multibyte flag is
 *   UNICODE_MULTIBYTE, otherwise the number of bytes that are not UTF-8
 *   continuation bytes.
 */
function drupal_strlen($text) {
  global $multibyte;
  if ($multibyte != UNICODE_MULTIBYTE) {
    // Drop the continuation bytes (0x80-0xBF); every byte that remains
    // starts a character.
    $leading_bytes = preg_replace("/[\x80-\xBF]/", '', $text);
    return strlen($leading_bytes);
  }
  return mb_strlen($text);
}
4050
406
/**
 * Uppercase a UTF-8 string.
 *
 * Uses mb_strtoupper() when the global $multibyte flag is UNICODE_MULTIBYTE.
 * Otherwise strtoupper() is applied and the two-byte sequences starting with
 * 0xC3 that match the pattern below are case-flipped by _unicode_caseflip().
 *
 * @param $text
 *   The string to convert.
 * @return
 *   The converted string.
 */
function drupal_strtoupper($text) {
  global $multibyte;
  if ($multibyte != UNICODE_MULTIBYTE) {
    // Use C-locale for ASCII-only uppercase, then case flip Latin-1 accented
    // letters.
    return preg_replace_callback('/\xC3[\xA0-\xB6\xB8-\xBE]/', '_unicode_caseflip', strtoupper($text));
  }
  return mb_strtoupper($text);
}
4220
423
/**
 * Lowercase a UTF-8 string.
 *
 * Uses mb_strtolower() when the global $multibyte flag is UNICODE_MULTIBYTE.
 * Otherwise strtolower() is applied and the two-byte sequences starting with
 * 0xC3 that match the pattern below are case-flipped by _unicode_caseflip().
 *
 * @param $text
 *   The string to convert.
 * @return
 *   The converted string.
 */
function drupal_strtolower($text) {
  global $multibyte;
  if ($multibyte != UNICODE_MULTIBYTE) {
    // Use C-locale for ASCII-only lowercase, then case flip Latin-1 accented
    // letters.
    return preg_replace_callback('/\xC3[\x80-\x96\x98-\x9E]/', '_unicode_caseflip', strtolower($text));
  }
  return mb_strtolower($text);
}
4390
440
/**
 * Helper function for case conversion of Latin-1.
 *
 * Used for flipping U+C0-U+DE to U+E0-U+FE and back: the first byte of the
 * matched two-byte sequence is kept and bit 0x20 of the second byte is
 * toggled.
 *
 * @param $matches
 *   The regular expression match; index 0 holds the two-byte sequence.
 * @return
 *   The two-byte sequence with its case flipped.
 */
function _unicode_caseflip($matches) {
  $sequence = $matches[0];
  $lead = $sequence[0];
  $flipped = ord($sequence[1]) ^ 0x20;
  return $lead . chr($flipped);
}
447
448
/**
 * Capitalize the first letter of a UTF-8 string.
 *
 * @param $text
 *   The string to convert.
 * @return
 *   The string with its first character passed through drupal_strtoupper().
 */
function drupal_ucfirst($text) {
  // Note: no mbstring equivalent!
  $initial = drupal_strtoupper(drupal_substr($text, 0, 1));
  $remainder = drupal_substr($text, 1);
  return $initial . $remainder;
}
455
456
/**
 * Cut off a piece of a string based on character indices and counts. Follows
 * the same behavior as PHP's own substr() function.
 *
 * Note that for cutting off a string at a known character/substring
 * location, the usage of PHP's normal strpos/substr is safe and
 * much faster.
 *
 * When the global $multibyte flag is UNICODE_MULTIBYTE the call is delegated
 * to mb_substr(). Otherwise the byte offsets are located by counting the
 * bytes that start a character (every byte outside the UTF-8 continuation
 * range 0x80-0xBF). The emulation never reads past the end of the string and
 * returns an empty string whenever the requested piece is empty, including a
 * $length of 0.
 *
 * @param $text
 *   The input string.
 * @param $start
 *   The character position to start at; a negative value counts from the end
 *   of the string.
 * @param $length
 *   The number of characters to return; a negative value leaves out that many
 *   characters at the end of the string. NULL (the default) returns
 *   everything up to the end.
 * @return
 *   The requested piece of the string.
 */
function drupal_substr($text, $start, $length = NULL) {
  global $multibyte;
  if ($multibyte == UNICODE_MULTIBYTE) {
    return $length === NULL ? mb_substr($text, $start) : mb_substr($text, $start, $length);
  }

  $strlen = strlen($text);

  // Find the starting byte offset.
  $istart = 0;
  if ($start > 0) {
    // Walk forward to the byte that starts character number $start (counting
    // from zero). If the string has fewer characters, the loop runs out and
    // leaves the offset at the end of the string.
    $chars = -1;
    for ($istart = 0; $istart < $strlen; $istart++) {
      $c = ord($text[$istart]);
      if ($c < 0x80 || $c >= 0xC0) {
        $chars++;
        if ($chars >= $start) {
          break;
        }
      }
    }
  }
  elseif ($start < 0) {
    // Walk backward from the end until abs($start) character starts have
    // been passed, or the beginning of the string is reached.
    $start = abs($start);
    $istart = $strlen;
    $chars = 0;
    while ($istart > 0 && $chars < $start) {
      $istart--;
      $c = ord($text[$istart]);
      if ($c < 0x80 || $c >= 0xC0) {
        $chars++;
      }
    }
  }

  // Find the ending byte offset (exclusive: the first byte that is not part
  // of the result).
  if ($length === NULL) {
    $iend = $strlen;
  }
  elseif ($length > 0) {
    // Walk forward from the start until $length further character starts
    // have been seen; the last one found is where the result ends. Reaching
    // the end of the string ends the result there.
    $iend = $istart;
    $chars = 0;
    while ($iend < $strlen && $chars < $length) {
      $iend++;
      if ($iend < $strlen) {
        $c = ord($text[$iend]);
        if ($c < 0x80 || $c >= 0xC0) {
          $chars++;
        }
      }
    }
  }
  elseif ($length < 0) {
    // Walk backward from the end until abs($length) character starts have
    // been passed; the result ends just before the last one found.
    $length = abs($length);
    $iend = $strlen;
    $chars = 0;
    while ($iend > 0 && $chars < $length) {
      $iend--;
      $c = ord($text[$iend]);
      if ($c < 0x80 || $c >= 0xC0) {
        $chars++;
      }
    }
  }
  else {
    // A length of zero always yields an empty string.
    return '';
  }

  if ($iend <= $istart) {
    return '';
  }
  return substr($text, $istart, $iend - $istart);
}
5350
5360
537