1: <?php
2: // {{{ license
3:
4: /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
5: //
6: // +----------------------------------------------------------------------+
7: // | This library is free software; you can redistribute it and/or modify |
8: // | it under the terms of the GNU Lesser General Public License as |
9: // | published by the Free Software Foundation; either version 2.1 of the |
10: // | License, or (at your option) any later version. |
11: // | |
12: // | This library is distributed in the hope that it will be useful, but |
13: // | WITHOUT ANY WARRANTY; without even the implied warranty of |
14: // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15: // | Lesser General Public License for more details. |
16: // | |
17: // | You should have received a copy of the GNU Lesser General Public |
18: // | License along with this library; if not, write to the Free Software |
19: // | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
20: // | USA. |
21: // +----------------------------------------------------------------------+
22: //
23:
24: // }}}
25:
/**
 * Encode/decode Internationalized Domain Names.
 *
 * The class converts internationalized domain names (see RFC 3490 for details),
 * as they can be used with various registries worldwide, between their original
 * (localized) form and their encoded form as it is used in the DNS
 * (Domain Name System).
 *
 * The class provides two public methods, encode() and decode(), which do exactly
 * what you would expect them to do. You are allowed to use complete domain names,
 * simple strings and complete email addresses as well. That means that you might
 * use any of the following notations:
 *
 * - www.nörgler.com
 * - xn--nrgler-wxa
 * - xn--brse-5qa.xn--knrz-1ra.info
 *
 * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
 * array. Unicode output is available in the same formats.
 * You can select your preferred format via {@link set_parameter()}.
 *
 * ACE input and output is always expected to be ASCII.
 *
 * @author Matthias Sommerfeld <mso@phlylabs.de>
 * @copyright 2004-2007 phlyLabs Berlin, http://phlylabs.de
 * @version 0.5.1
 *
 */
class idna_convert
{
    /**
     * Holds all relevant mapping tables, loaded from a separate file on construct
     * See RFC3454 for details
     *
     * @var array
     * @access private
     */
    var $NP = array();

    // Internal settings, do not mess with them
    var $_punycode_prefix = 'xn--';
    var $_invalid_ucs = 0x80000000;
    var $_max_ucs = 0x10FFFF;
    // Punycode bootstring parameters (RFC 3492, section 5)
    var $_base = 36;
    var $_tmin = 1;
    var $_tmax = 26;
    var $_skew = 38;
    var $_damp = 700;
    var $_initial_bias = 72;
    var $_initial_n = 0x80;
    // Hangul syllable (de)composition constants
    var $_sbase = 0xAC00;
    var $_lbase = 0x1100;
    var $_vbase = 0x1161;
    var $_tbase = 0x11A7;
    var $_lcount = 19;
    var $_vcount = 21;
    var $_tcount = 28;
    var $_ncount = 588; // _vcount * _tcount
    var $_scount = 11172; // _lcount * _tcount * _vcount
    // First code point after the Hangul syllable block; set by the constructor
    var $slast = 0;
    var $_error = false;

    // See {@link set_parameter()} for details of how to change the following
    // settings from within your script / application
    var $_api_encoding = 'utf8'; // Default input charset is UTF-8
    var $_allow_overlong = false; // Overlong UTF-8 encodings are forbidden
    var $_strict_mode = false; // Behave strict or not

    /**
     * Constructor.
     *
     * PHP 8 no longer treats a method named like the class as the constructor,
     * so the real work is delegated to the legacy-named method below. That way
     * the object is initialised identically on every PHP version.
     *
     * @param mixed Optional array of Parameter => Value pairs, see {@link set_parameter()}
     * @access public
     */
    function __construct($options = false)
    {
        $this->idna_convert($options);
    }

    /**
     * Legacy (PHP 4 style) constructor; kept so existing callers keep working.
     *
     * Loads the NAMEPREP tables from npdata.ser next to this file and applies
     * the given options. If the tables cannot be loaded, the problem is recorded
     * (see {@link get_last_error()}) and $NP stays an empty array.
     *
     * @param mixed Optional array of Parameter => Value pairs, see {@link set_parameter()}
     * @return boolean Result of set_parameter() if options were given, true otherwise
     * @access public
     */
    function idna_convert($options = false)
    {
        $this->slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount;

        // NOTE: npdata.ser ships with the library; never point this at untrusted data,
        // since unserialize() must not be fed attacker-controlled input.
        $data_file = dirname(__FILE__).'/npdata.ser';
        $tables = false;
        if (is_readable($data_file)) {
            $raw = file_get_contents($data_file);
            if (false !== $raw) {
                $tables = unserialize($raw);
            }
        }
        if (is_array($tables)) {
            $this->NP = $tables;
        } else {
            $this->_error('Unable to load the NAMEPREP mapping tables from '.$data_file);
        }

        // If parameters are given, pass these to the respective method
        if (is_array($options)) {
            return $this->set_parameter($options);
        }
        return true;
    }

    /**
     * Sets a new option value. Available options and values:
     * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
     *             'ucs4_string' and 'ucs4_array' respectively for UCS4)]
     * [overlong - Unicode does not allow unnecessarily long encodings of chars,
     *             to allow this, set this parameter to true, else to false;
     *             default is false.]
     * [strict   - true: strict mode, good for registration purposes - Causes errors
     *             on failures; false: loose mode, ideal for "wildlife" applications
     *             by silently ignoring errors and returning the original input instead]
     *
     * Processing stops at the first unknown option or value; options seen before
     * that point have already been applied.
     *
     * @param mixed Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param string Value to use (if parameter 1 is a string)
     * @return boolean true on success, false otherwise
     * @access public
     */
    function set_parameter($option, $value = false)
    {
        if (!is_array($option)) {
            $option = array($option => $value);
        }
        foreach ($option as $k => $v) {
            switch ($k) {
                case 'encoding':
                    switch ($v) {
                        case 'utf8':
                        case 'ucs4_string':
                        case 'ucs4_array':
                            $this->_api_encoding = $v;
                            break;
                        default:
                            $this->_error('Set Parameter: Unknown parameter '.$v.' for option '.$k);
                            return false;
                    }
                    break;
                case 'overlong':
                    $this->_allow_overlong = ($v) ? true : false;
                    break;
                case 'strict':
                    $this->_strict_mode = ($v) ? true : false;
                    break;
                default:
                    $this->_error('Set Parameter: Unknown option '.$k);
                    return false;
            }
        }
        return true;
    }

    /**
     * Decode a given ACE domain name
     *
     * Accepts a single label, a dotted domain name, an email address or a URL.
     * Labels that cannot be decoded are passed through unchanged. In strict mode
     * only single labels are accepted.
     *
     * @param string Domain name (ACE string)
     * [@param string Desired output encoding, see {@link set_parameter}]
     * @return string Decoded Domain name (UTF-8 or UCS-4), false on error
     * @access public
     */
    function decode($input, $one_time_encoding = false)
    {
        // Optionally set
        if ($one_time_encoding) {
            switch ($one_time_encoding) {
                case 'utf8':
                case 'ucs4_string':
                case 'ucs4_array':
                    break;
                default:
                    $this->_error('Unknown encoding '.$one_time_encoding);
                    return false;
            }
        }
        // Make sure to drop any newline characters around
        $input = trim($input);

        // Negotiate input and try to determine, whether it is a plain string,
        // an email address or something like a complete URL
        if (strpos($input, '@')) { // Maybe it is an email address (an '@' at offset 0 does not count)
            // No no in strict mode
            if ($this->_strict_mode) {
                $this->_error('Only simple domain name parts can be handled in strict mode');
                return false;
            }
            list ($email_pref, $input) = explode('@', $input, 2);
            // Domain part first, then the local part; only prefixed labels are touched
            $input = $this->_decode_labels($input, true);
            $email_pref = $this->_decode_labels($email_pref, true);
            $return = $email_pref . '@' . $input;
        } elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
            // No no in strict mode
            if ($this->_strict_mode) {
                $this->_error('Only simple domain name parts can be handled in strict mode');
                return false;
            }
            $parsed = parse_url($input);
            if (isset($parsed['host'])) {
                $parsed['host'] = $this->_decode_labels($parsed['host'], false);
                $return =
                        (empty($parsed['scheme']) ? '' : $parsed['scheme'].(strtolower($parsed['scheme']) == 'mailto' ? ':' : '://'))
                        .(empty($parsed['user']) ? '' : $parsed['user'].(empty($parsed['pass']) ? '' : ':'.$parsed['pass']).'@')
                        .$parsed['host']
                        .(empty($parsed['port']) ? '' : ':'.$parsed['port'])
                        .(empty($parsed['path']) ? '' : $parsed['path'])
                        .(empty($parsed['query']) ? '' : '?'.$parsed['query'])
                        .(empty($parsed['fragment']) ? '' : '#'.$parsed['fragment']);
            } else { // parse_url seems to have failed, try without it
                $return = $this->_decode_labels($input, false);
            }
        } else { // Otherwise we consider it being a pure domain name string
            $return = $this->_decode($input);
            if (!$return) $return = $input;
        }
        // The output is UTF-8 by default, other output formats need conversion here
        // If one time encoding is given, use this, else the objects property
        switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) {
            case 'utf8':
                return $return;
            case 'ucs4_string':
                return $this->_ucs4_to_ucs4_string($this->_utf8_to_ucs4($return));
            case 'ucs4_array':
                return $this->_utf8_to_ucs4($return);
            default:
                $this->_error('Unsupported output format');
                return false;
        }
    }

    /**
     * Decode every label of a dot separated name, leaving undecodable labels as they are
     *
     * @param string Dot separated name
     * @param boolean true: only try labels starting with the Punycode prefix
     * @return string The name with all decodable labels replaced by their UTF-8 form
     * @access private
     */
    function _decode_labels($dotted, $prefixed_only)
    {
        $prefix = $this->_punycode_prefix;
        $prefix_len = strlen($prefix);
        $labels = explode('.', $dotted);
        foreach ($labels as $pos => $label) {
            if ($prefixed_only && 0 !== strncmp($label, $prefix, $prefix_len)) continue;
            $plain = $this->_decode($label);
            if ($plain) $labels[$pos] = $plain;
        }
        return join('.', $labels);
    }

    /**
     * Encode a given UTF-8 domain name
     *
     * The input is split at the anchoring characters '.', '/', ':', '?' and '@'
     * (the ideographic / fullwidth / halfwidth full stops count as '.'); every
     * segment is encoded on its own and segments that cannot be encoded are
     * passed through. In strict mode any anchoring character is an error.
     *
     * @param string Domain name (UTF-8 or UCS-4)
     * [@param string Desired input encoding, see {@link set_parameter}]
     * @return string Encoded Domain name (ACE string), false on error
     * @access public
     */
    function encode($decoded, $one_time_encoding = false)
    {
        // Forcing conversion of input to UCS4 array
        // If one time encoding is given, use this, else the objects property
        switch ($one_time_encoding ? $one_time_encoding : $this->_api_encoding) {
            case 'utf8':
                $decoded = $this->_utf8_to_ucs4($decoded);
                break;
            case 'ucs4_string':
                $decoded = $this->_ucs4_string_to_ucs4($decoded);
                break;
            case 'ucs4_array':
                break;
            default:
                $this->_error('Unsupported input format: '.($one_time_encoding ? $one_time_encoding : $this->_api_encoding));
                return false;
        }

        // No input, no output, what else did you expect?
        if (empty($decoded)) return '';

        // Anchor for iteration: index of the first char of the current segment
        $last_begin = 0;
        // Output string
        $output = '';
        foreach ($decoded as $k => $v) {
            // Make sure to use just the plain dot
            if (0x3002 == $v || 0xFF0E == $v || 0xFF61 == $v) $v = 0x2E;
            // Only anchoring characters end a segment
            if (0x2E != $v && 0x2F != $v && 0x3A != $v && 0x3F != $v && 0x40 != $v) continue;

            // Neither email addresses nor URLs allowed in strict mode
            if ($this->_strict_mode) {
                $this->_error('Neither email addresses nor URLs are allowed in strict mode.');
                return false;
            }
            // An anchor at the very first position is skipped, nothing precedes it
            if ($k) {
                $output .= $this->_encode_or_keep(array_slice($decoded, $last_begin, (($k)-$last_begin)));
                $output .= chr($v);
            }
            $last_begin = $k + 1;
        }
        // Catch the rest of the string
        if ($last_begin) {
            $inp_len = sizeof($decoded);
            $output .= $this->_encode_or_keep(array_slice($decoded, $last_begin, (($inp_len)-$last_begin)));
            return $output;
        }
        return $this->_encode_or_keep($decoded);
    }

    /**
     * Encode one segment; if that is not possible return it as plain UTF-8 instead
     *
     * @param array UCS4 sequence
     * @return string ACE string or the UTF-8 form of the input
     * @access private
     */
    function _encode_or_keep($chars)
    {
        $ace = $this->_encode($chars);
        return ($ace) ? $ace : $this->_ucs4_to_utf8($chars);
    }

    /**
     * Use this method to get the last error occurred
     * @param void
     * @return string The last error that occurred, false if there was none
     * @access public
     */
    function get_last_error()
    {
        return $this->_error;
    }

    /**
     * The actual decoding algorithm (Punycode, RFC 3492 section 6.2)
     *
     * @param string A single ACE label including the Punycode prefix
     * @return string UTF-8 string, false if the label is not valid Punycode
     * @access private
     */
    function _decode($encoded)
    {
        $prefix_len = strlen($this->_punycode_prefix);
        // We do need to find the Punycode prefix
        if (0 !== strncmp($encoded, $this->_punycode_prefix, $prefix_len)) {
            $this->_error('This is not a punycode string');
            return false;
        }
        // If nothing left after removing the prefix, it is hopeless
        if (strlen($encoded) <= $prefix_len) {
            $this->_error('The given encoded string was empty');
            return false;
        }
        // Find last occurrence of the delimiter; everything between the prefix
        // and the delimiter are basic code points which are copied literally
        $decoded = array();
        $delim_pos = strrpos($encoded, '-');
        if ($delim_pos > $prefix_len) {
            for ($k = $prefix_len; $k < $delim_pos; ++$k) {
                $decoded[] = ord($encoded[$k]);
            }
        }
        $deco_len = count($decoded);
        $enco_len = strlen($encoded);

        // Wandering through the strings; init
        $is_first = true;
        $bias = $this->_initial_bias;
        $idx = 0;
        $char = $this->_initial_n;

        for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
            // Decode one generalized variable-length integer into $idx
            for ($old_idx = $idx, $w = 1, $k = $this->_base; 1 ; $k += $this->_base) {
                // A truncated label would otherwise be read past its end forever
                if ($enco_idx >= $enco_len) {
                    $this->_error('Malformed punycode string: unexpected end of input');
                    return false;
                }
                $digit = $this->_decode_digit($encoded[$enco_idx++]);
                if ($digit >= $this->_base) {
                    $this->_error('Malformed punycode string: invalid digit at byte '.($enco_idx - 1));
                    return false;
                }
                $idx += $digit * $w;
                $t = ($k <= $bias) ? $this->_tmin :
                        (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
                if ($digit < $t) break;
                $w = (int) ($w * ($this->_base - $t));
            }
            $bias = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
            $is_first = false;
            $char += (int) ($idx / ($deco_len + 1));
            $idx %= ($deco_len + 1);
            if ($deco_len > 0) {
                // Make room for the decoded char
                for ($i = $deco_len; $i > $idx; $i--) {
                    $decoded[$i] = $decoded[($i - 1)];
                }
            }
            $decoded[$idx++] = $char;
        }
        return $this->_ucs4_to_utf8($decoded);
    }

    /**
     * The actual encoding algorithm (NAMEPREP followed by Punycode, RFC 3492 section 6.3)
     *
     * @param array UCS4 sequence of a single label
     * @return string ACE label, false if the label cannot or need not be encoded
     * @access private
     */
    function _encode($decoded)
    {
        // We cannot encode a domain name containing the Punycode prefix
        $extract = strlen($this->_punycode_prefix);
        $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
        $check_deco = array_slice($decoded, 0, $extract);

        if ($check_pref == $check_deco) {
            $this->_error('This is already a punycode string');
            return false;
        }
        // We will not try to encode strings consisting of basic code points only
        $encodable = false;
        foreach ($decoded as $k => $v) {
            if ($v > 0x7a) {
                $encodable = true;
                break;
            }
        }
        if (!$encodable) {
            $this->_error('The given string does not contain encodable chars');
            return false;
        }

        // Do NAMEPREP
        $decoded = $this->_nameprep($decoded);
        if (!$decoded || !is_array($decoded)) return false; // NAMEPREP failed

        $deco_len = count($decoded);
        if (!$deco_len) return false; // Empty array

        $codecount = 0; // How many chars have been consumed

        $encoded = '';
        // Copy all basic code points to output
        for ($i = 0; $i < $deco_len; ++$i) {
            $test = $decoded[$i];
            // Will match [-0-9a-zA-Z]
            if ((0x30 <= $test && $test <= 0x39) || (0x41 <= $test && $test <= 0x5A)
                    || (0x61 <= $test && $test <= 0x7A) || (0x2D == $test)) {
                $encoded .= chr($test);
                $codecount++;
            } elseif ($test < $this->_initial_n) {
                // Any other ASCII code point would never be consumed by the loop
                // below (it starts at _initial_n), which would then not terminate
                $this->_error('Unable to encode: code point U+'.sprintf('%08X', $test).' is not allowed in a domain label');
                return false;
            }
        }
        if ($codecount == $deco_len) return $encoded; // All codepoints were basic ones

        // Start with the prefix; copy it to output
        $encoded = $this->_punycode_prefix.$encoded;

        // If we have basic code points in output, add an hyphen to the end
        if ($codecount) $encoded .= '-';

        // Now find and encode all non-basic code points
        $is_first = true;
        $cur_code = $this->_initial_n;
        $bias = $this->_initial_bias;
        $delta = 0;
        while ($codecount < $deco_len) {
            // Find the smallest code point >= the current code point and
            // remember the last occurrence of it in the input
            for ($i = 0, $next_code = $this->_max_ucs; $i < $deco_len; $i++) {
                if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
                    $next_code = $decoded[$i];
                }
            }

            $delta += ($next_code - $cur_code) * ($codecount + 1);
            $cur_code = $next_code;

            // Scan input again and encode all characters whose code point is $cur_code
            for ($i = 0; $i < $deco_len; $i++) {
                if ($decoded[$i] < $cur_code) {
                    $delta++;
                } elseif ($decoded[$i] == $cur_code) {
                    // Emit $delta as a generalized variable-length integer
                    for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
                        $t = ($k <= $bias) ? $this->_tmin :
                                (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
                        if ($q < $t) break;
                        $encoded .= $this->_encode_digit(intval($t + (($q - $t) % ($this->_base - $t)))); //v0.4.5 Changed from ceil() to intval()
                        $q = (int) (($q - $t) / ($this->_base - $t));
                    }
                    $encoded .= $this->_encode_digit($q);
                    $bias = $this->_adapt($delta, $codecount+1, $is_first);
                    $codecount++;
                    $delta = 0;
                    $is_first = false;
                }
            }
            $delta++;
            $cur_code++;
        }
        return $encoded;
    }

    /**
     * Adapt the bias according to the current code point and position
     * (bias adaptation function of RFC 3492 section 6.1)
     *
     * @param integer Delta that has just been encoded / decoded
     * @param integer Number of code points handled so far, including the current one
     * @param boolean true for the very first delta of a label
     * @return integer The new bias
     * @access private
     */
    function _adapt($delta, $npoints, $is_first)
    {
        $delta = intval($is_first ? ($delta / $this->_damp) : ($delta / 2));
        $delta += intval($delta / $npoints);
        for ($k = 0; $delta > (($this->_base - $this->_tmin) * $this->_tmax) / 2; $k += $this->_base) {
            $delta = intval($delta / ($this->_base - $this->_tmin));
        }
        return intval($k + ($this->_base - $this->_tmin + 1) * $delta / ($delta + $this->_skew));
    }

    /**
     * Encoding a certain digit: 0..25 map to 'a'..'z', 26..35 map to '0'..'9'
     *
     * @param integer Digit value (0..35)
     * @return string The corresponding ASCII character
     * @access private
     */
    function _encode_digit($d)
    {
        return chr(($d < 26) ? $d + 97 : $d + 22);
    }

    /**
     * Decode a certain digit: '0'..'9' map to 26..35, letters (either case) to 0..25
     *
     * @param string A single character
     * @return integer The digit value, or _base for anything that is not a valid digit
     * @access private
     */
    function _decode_digit($cp)
    {
        $cp = ord($cp);
        if (48 <= $cp && $cp <= 57) return $cp - 22;  // 0-9
        if (65 <= $cp && $cp <= 90) return $cp - 65;  // A-Z
        if (97 <= $cp && $cp <= 122) return $cp - 97; // a-z
        return $this->_base;
    }

    /**
     * Internal error handling method; remembers the message for get_last_error()
     * @access private
     */
    function _error($error = '')
    {
        $this->_error = $error;
    }

    /**
     * Do Nameprep according to RFC3491 and RFC3454
     * @param array Unicode Characters
     * @return array Unicode Characters, Nameprep'd; false on prohibited input or missing tables
     * @access private
     */
    function _nameprep($input)
    {
        // Without the mapping tables there is nothing we can do
        foreach (array('map_nothing', 'prohibit', 'general_prohibited', 'prohibit_ranges', 'replacemaps') as $table) {
            if (!isset($this->NP[$table]) || !is_array($this->NP[$table])) {
                $this->_error('NAMEPREP: Mapping table '.$table.' is not available');
                return false;
            }
        }
        $output = array();
        //
        // Mapping
        // Walking through the input array, performing the required steps on each of
        // the input chars and putting the result into the output array
        // While mapping required chars we apply the canonical ordering
        foreach ($input as $v) {
            // Map to nothing == skip that code point
            if (in_array($v, $this->NP['map_nothing'])) continue;

            // Try to find prohibited input
            if (in_array($v, $this->NP['prohibit']) || in_array($v, $this->NP['general_prohibited'])) {
                $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
                return false;
            }
            foreach ($this->NP['prohibit_ranges'] as $range) {
                if ($range[0] <= $v && $v <= $range[1]) {
                    $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
                    return false;
                }
            }
            //
            // Hangul syllable decomposition
            if (0xAC00 <= $v && $v <= 0xD7AF) {
                foreach ($this->_hangul_decompose($v) as $out) {
                    $output[] = (int) $out;
                }
            // There's a decomposition mapping for that code point
            } elseif (isset($this->NP['replacemaps'][$v])) {
                foreach ($this->_apply_cannonical_ordering($this->NP['replacemaps'][$v]) as $out) {
                    $output[] = (int) $out;
                }
            } else {
                $output[] = (int) $v;
            }
        }
        // Before applying any Combining, try to rearrange any Hangul syllables
        $output = $this->_hangul_compose($output);
        //
        // Combine code points
        //
        // NOTE(review): the bookkeeping below (the slice that ends before the
        // current mark, and unset() of index $out_len, which is one past the last
        // element) looks suspicious but is kept exactly as it was - verify it
        // against the contents of the mapping tables before changing it.
        $last_class = 0;
        $last_starter = 0;
        $out_len = count($output);
        for ($i = 0; $i < $out_len; ++$i) {
            $class = $this->_get_combining_class($output[$i]);
            if ((!$last_class || $last_class > $class) && $class) {
                // Try to match
                $seq_len = $i - $last_starter;
                $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
                // On match: Replace the last starter with the composed character and remove
                // the now redundant non-starter(s)
                if ($out) {
                    $output[$last_starter] = $out;
                    // $out is a single code point. This used to read count($out), which
                    // yielded 1 for an integer but is a TypeError as of PHP 8.
                    if (1 != $seq_len) {
                        for ($j = $i+1; $j < $out_len; ++$j) {
                            $output[$j-1] = $output[$j];
                        }
                        unset($output[$out_len]);
                    }
                    // Rewind the for loop by one, since there can be more possible compositions
                    $i--;
                    $out_len--;
                    $last_class = ($i == $last_starter) ? 0 : $this->_get_combining_class($output[$i-1]);
                    continue;
                }
            }
            // The current class is 0
            if (!$class) $last_starter = $i;
            $last_class = $class;
        }
        return $output;
    }

    /**
     * Decomposes a Hangul syllable
     * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
     * @param integer 32bit UCS4 code point
     * @return array Either Hangul Syllable decomposed or original 32bit value as one value array
     * @access private
     */
    function _hangul_decompose($char)
    {
        $sindex = (int) $char - $this->_sbase;
        if ($sindex < 0 || $sindex >= $this->_scount) {
            return array($char);
        }
        $result = array();
        // The cast has to cover the whole quotient, otherwise floats are returned
        $result[] = $this->_lbase + (int) ($sindex / $this->_ncount);
        $result[] = $this->_vbase + (int) (($sindex % $this->_ncount) / $this->_tcount);
        $T = $this->_tbase + $sindex % $this->_tcount;
        // A trailing consonant index of 0 means "no trailing consonant"
        if ($T != $this->_tbase) $result[] = $T;
        return $result;
    }

    /**
     * Composes a Hangul syllable
     * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
     * @param array Decomposed UCS4 sequence
     * @return array UCS4 sequence with syllables composed
     * @access private
     */
    function _hangul_compose($input)
    {
        $inp_len = count($input);
        if (!$inp_len) return array();
        $result = array();
        $last = (int) $input[0];
        $result[] = $last; // copy first char from input to output

        for ($i = 1; $i < $inp_len; ++$i) {
            $char = (int) $input[$i];
            $sindex = $last - $this->_sbase;
            $lindex = $last - $this->_lbase;
            $vindex = $char - $this->_vbase;
            $tindex = $char - $this->_tbase;
            // Find out, whether two current characters are LV and T.
            // The bounds are exclusive: _tbase itself is not a trailing consonant
            // and _tbase + _tcount lies beyond the last one.
            if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount == 0)
                    && 0 < $tindex && $tindex < $this->_tcount) {
                // create syllable of form LVT
                $last += $tindex;
                $result[(count($result) - 1)] = $last; // reset last
                continue; // discard char
            }
            // Find out, whether two current characters form L and V
            if (0 <= $lindex && $lindex < $this->_lcount && 0 <= $vindex && $vindex < $this->_vcount) {
                // create syllable of form LV
                $last = (int) $this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount;
                $result[(count($result) - 1)] = $last; // reset last
                continue; // discard char
            }
            // if neither case was true, just add the character
            $last = $char;
            $result[] = $char;
        }
        return $result;
    }

    /**
     * Returns the combining class of a certain wide char
     * @param integer Wide char to check (32bit integer)
     * @return integer Combining class if found, else 0
     * @access private
     */
    function _get_combining_class($char)
    {
        return isset($this->NP['norm_combcls'][$char]) ? $this->NP['norm_combcls'][$char] : 0;
    }

    /**
     * Applies the canonical ordering of a decomposed UCS4 sequence
     * @param array Decomposed UCS4 sequence
     * @return array Ordered UCS4 sequence
     * @access private
     */
    function _apply_cannonical_ordering($input)
    {
        $swap = true;
        $size = count($input);
        // Repeat the pass until one of them did not have to move anything
        while ($swap) {
            $swap = false;
            $last = $this->_get_combining_class(intval($input[0]));
            for ($i = 0; $i < $size-1; ++$i) {
                $next = $this->_get_combining_class(intval($input[$i+1]));
                if ($next != 0 && $last > $next) {
                    // Move item leftward until it fits
                    for ($j = $i + 1; $j > 0; --$j) {
                        if ($this->_get_combining_class(intval($input[$j-1])) <= $next) break;
                        $t = intval($input[$j]);
                        $input[$j] = intval($input[$j-1]);
                        $input[$j-1] = $t;
                        $swap = true;
                    }
                    // Reentering the loop looking at the old character again
                    $next = $last;
                }
                $last = $next;
            }
        }
        return $input;
    }

    /**
     * Do composition of a sequence of starter and non-starter
     *
     * Looks for an entry in the 'replacemaps' table whose replacement equals the
     * given sequence.
     *
     * @param array UCS4 Decomposed sequence
     * @return integer The code point mapping to the given sequence, false if there is none
     * @access private
     */
    function _combine($input)
    {
        $inp_len = count($input);
        // Nothing can map to an empty sequence
        if (!$inp_len) return false;
        foreach ($this->NP['replacemaps'] as $np_src => $np_target) {
            if ($np_target[0] != $input[0]) continue;
            if (count($np_target) != $inp_len) continue;
            $hit = false;
            foreach ($input as $k2 => $v2) {
                if ($v2 == $np_target[$k2]) {
                    $hit = true;
                } else {
                    $hit = false;
                    break;
                }
            }
            if ($hit) return $np_src;
        }
        return false;
    }

    /**
     * This converts an UTF-8 encoded string to its UCS-4 representation
     * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
     * each of the "chars". This is due to PHP not being able to handle strings with
     * bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
     * The following UTF-8 encodings are supported:
     * bytes bits  representation
     * 1        7  0xxxxxxx
     * 2       11  110xxxxx 10xxxxxx
     * 3       16  1110xxxx 10xxxxxx 10xxxxxx
     * 4       21  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 5       26  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 6       31  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * Each x represents a bit that can be used to store character data.
     * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
     *
     * @param string UTF-8 string
     * @return array UCS4 sequence, false on malformed input
     * @access private
     */
    function _utf8_to_ucs4($input)
    {
        $output = array();
        $out_len = 0;
        $inp_len = strlen($input);
        $mode = 'next';
        $test = 'none';
        for ($k = 0; $k < $inp_len; ++$k) {
            $v = ord($input[$k]); // Extract byte from input string

            if ($v < 128) { // We found an ASCII char - put into string as is
                $output[$out_len] = $v;
                ++$out_len;
                if ('add' == $mode) {
                    $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                    return false;
                }
                continue;
            }
            if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
                $start_byte = $v;
                $mode = 'add';
                $test = 'range';
                if ($v >> 5 == 6) { // &110xxxxx 10xxxxx
                    $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
                    $v = ($v - 192) << 6;
                } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
                    $next_byte = 1;
                    $v = ($v - 224) << 12;
                } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 2;
                    $v = ($v - 240) << 18;
                } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 3;
                    $v = ($v - 248) << 24;
                } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 4;
                    $v = ($v - 252) << 30;
                } else {
                    $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k);
                    return false;
                }
                // Store the bits of the start byte; the continuation bytes are added below
                $output[$out_len] = (int) $v;
                ++$out_len;
                continue;
            }
            if ('add' == $mode) {
                // Only the first continuation byte decides about overlong / out of range forms
                if (!$this->_allow_overlong && $test == 'range') {
                    $test = 'none';
                    if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
                        $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
                        return false;
                    }
                }
                if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
                    $v = ($v - 128) << ($next_byte * 6);
                    $output[($out_len - 1)] += $v;
                    --$next_byte;
                } else {
                    $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                    return false;
                }
                if ($next_byte < 0) {
                    $mode = 'next';
                }
            }
        } // for
        // Still waiting for continuation bytes: the input ended in the middle of a char
        if ('add' == $mode) {
            $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$inp_len);
            return false;
        }
        return $output;
    }

    /**
     * Convert UCS-4 array into UTF-8 string
     * See _utf8_to_ucs4() for details
     *
     * @param array UCS4 sequence
     * @return string UTF-8 string, false if a value does not fit into 31 bits
     * @access private
     */
    function _ucs4_to_utf8($input)
    {
        $output = '';
        $k = 0; // Position of the current value, for error reporting only
        foreach ($input as $v) {
            ++$k;
            if ($v < 128) { // 7bit are transferred literally
                $output .= chr($v);
            } elseif ($v < (1 << 11)) { // 2 bytes
                $output .= chr(192 + ($v >> 6)) . chr(128 + ($v & 63));
            } elseif ($v < (1 << 16)) { // 3 bytes
                $output .= chr(224 + ($v >> 12)) . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
            } elseif ($v < (1 << 21)) { // 4 bytes
                $output .= chr(240 + ($v >> 18)) . chr(128 + (($v >> 12) & 63))
                         . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
            } elseif ($v < (1 << 26)) { // 5 bytes
                $output .= chr(248 + ($v >> 24)) . chr(128 + (($v >> 18) & 63))
                         . chr(128 + (($v >> 12) & 63)) . chr(128 + (($v >> 6) & 63))
                         . chr(128 + ($v & 63));
            } elseif ($v < (1 << 31)) { // 6 bytes
                $output .= chr(252 + ($v >> 30)) . chr(128 + (($v >> 24) & 63))
                         . chr(128 + (($v >> 18) & 63)) . chr(128 + (($v >> 12) & 63))
                         . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
            } else {
                $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$k);
                return false;
            }
        }
        return $output;
    }

    /**
     * Convert UCS-4 array into UCS-4 string (4 bytes per value, most significant byte first)
     *
     * @param array UCS4 sequence
     * @return string UCS4 string
     * @access private
     */
    function _ucs4_to_ucs4_string($input)
    {
        $output = '';
        // Take array values and split output to 4 bytes per value
        // The bit mask is 255, which reads &11111111
        foreach ($input as $v) {
            $output .= chr(($v >> 24) & 255).chr(($v >> 16) & 255).chr(($v >> 8) & 255).chr($v & 255);
        }
        return $output;
    }

    /**
     * Convert UCS-4 string into UCS-4 array
     *
     * @param string UCS4 string (4 bytes per value, most significant byte first)
     * @return array UCS4 sequence, false if the length is not a multiple of 4
     * @access private
     */
    function _ucs4_string_to_ucs4($input)
    {
        $output = array();
        $inp_len = strlen($input);
        // Input length must be dividable by 4
        if ($inp_len % 4) {
            $this->_error('Input UCS4 string is broken');
            return false;
        }
        // Empty input - return empty output
        if (!$inp_len) return $output;
        for ($i = 0, $out_len = -1; $i < $inp_len; ++$i) {
            // Increment output position every 4 input bytes
            if (!($i % 4)) {
                $out_len++;
                $output[$out_len] = 0;
            }
            $output[$out_len] += ord($input[$i]) << (8 * (3 - ($i % 4) ) );
        }
        return $output;
    }
}
940:
/**
 * Adapter class for aligning the API of idna_convert with that of Net_IDNA
 * @author Matthias Sommerfeld <mso@phlylabs.de>
 */
class Net_IDNA_php4 extends idna_convert
{
    /**
     * Sets a new option value. Available options and values:
     * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
     *             'ucs4_string' and 'ucs4_array' respectively for UCS4)]
     * [overlong - Unicode does not allow unnecessarily long encodings of chars,
     *             to allow this, set this parameter to true, else to false;
     *             default is false.]
     * [strict   - true: strict mode, good for registration purposes - Causes errors
     *             on failures; false: loose mode, ideal for "wildlife" applications
     *             by silently ignoring errors and returning the original input instead]
     *
     * This is the Net_IDNA style name for the inherited set_parameter() method.
     *
     * @param mixed Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param string Value to use (if parameter 1 is a string)
     * @return boolean true on success, false otherwise
     * @access public
     */
    function setParams($option, $param = false)
    {
        // This class IS the converter: there is no $this->IC delegate and the
        // inherited method is called set_parameter() (singular)
        return $this->set_parameter($option, $param);
    }
}
968:
969: ?>