File idn/idna_convert.class.php | SimplePie Documentation

  1: <?php
  2: // {{{ license
  3: 
  4: /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
  5: //
  6: // +----------------------------------------------------------------------+
  7: // | This library is free software; you can redistribute it and/or modify |
  8: // | it under the terms of the GNU Lesser General Public License as       |
  9: // | published by the Free Software Foundation; either version 2.1 of the |
 10: // | License, or (at your option) any later version.                      |
 11: // |                                                                      |
 12: // | This library is distributed in the hope that it will be useful, but  |
 13: // | WITHOUT ANY WARRANTY; without even the implied warranty of           |
 14: // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    |
 15: // | Lesser General Public License for more details.                      |
 16: // |                                                                      |
 17: // | You should have received a copy of the GNU Lesser General Public     |
 18: // | License along with this library; if not, write to the Free Software  |
 19: // | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 |
 20: // | USA.                                                                 |
 21: // +----------------------------------------------------------------------+
 22: //
 23: 
 24: // }}}
 25: 
 26: /**
 27:  * Encode/decode Internationalized Domain Names.
 28:  *
 29:  * The class allows to convert internationalized domain names
 30:  * (see RFC 3490 for details) as they can be used with various registries worldwide
 31:  * to be translated between their original (localized) form and their encoded form
 32:  * as it will be used in the DNS (Domain Name System).
 33:  *
 34:  * The class provides two public methods, encode() and decode(), which do exactly
 35:  * what you would expect them to do. You are allowed to use complete domain names,
 36:  * simple strings and complete email addresses as well. That means, that you might
 37:  * use any of the following notations:
 38:  *
 39:  * - www.nörgler.com
 40:  * - xn--nrgler-wxa
 41:  * - xn--brse-5qa.xn--knrz-1ra.info
 42:  *
 43:  * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
 44:  * array. Unicode output is available in the same formats.
  45:  * You can select your preferred format via {@link set_parameter()}.
 46:  *
 47:  * ACE input and output is always expected to be ASCII.
 48:  *
 49:  * @author  Matthias Sommerfeld <mso@phlylabs.de>
 50:  * @copyright 2004-2007 phlyLabs Berlin, http://phlylabs.de
 51:  * @version 0.5.1
 52:  *
 53:  */
 54: class idna_convert
 55: {
 56:     /**
  57:      * Holds all relevant mapping tables, loaded from a separate file on construction
 58:      * See RFC3454 for details
 59:      *
 60:      * @var array
 61:      * @access private
 62:      */
 63:     var $NP = array();
 64: 
 65:     // Internal settings, do not mess with them
 66:     var $_punycode_prefix = 'xn--';
 67:     var $_invalid_ucs =     0x80000000;
 68:     var $_max_ucs =         0x10FFFF;
 69:     var $_base =            36;
 70:     var $_tmin =            1;
 71:     var $_tmax =            26;
 72:     var $_skew =            38;
 73:     var $_damp =            700;
 74:     var $_initial_bias =    72;
 75:     var $_initial_n =       0x80;
 76:     var $_sbase =           0xAC00;
 77:     var $_lbase =           0x1100;
 78:     var $_vbase =           0x1161;
 79:     var $_tbase =           0x11A7;
 80:     var $_lcount =          19;
 81:     var $_vcount =          21;
 82:     var $_tcount =          28;
 83:     var $_ncount =          588;   // _vcount * _tcount
 84:     var $_scount =          11172; // _lcount * _tcount * _vcount
 85:     var $_error =           false;
 86: 
  87:     // See {@link set_parameter()} for details of how to change the following
 88:     // settings from within your script / application
 89:     var $_api_encoding   =  'utf8'; // Default input charset is UTF-8
 90:     var $_allow_overlong =  false;  // Overlong UTF-8 encodings are forbidden
 91:     var $_strict_mode    =  false;  // Behave strict or not
 92: 
 93:     // The constructor
 94:     function idna_convert($options = false)
 95:     {
 96:         $this->slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount;
 97:         if (function_exists('file_get_contents')) {
 98:             $this->NP = unserialize(file_get_contents(dirname(__FILE__).'/npdata.ser'));
 99:         } else {
100:             $this->NP = unserialize(join('', file(dirname(__FILE__).'/npdata.ser')));
101:         }
102:         // If parameters are given, pass these to the respective method
103:         if (is_array($options)) {
104:             return $this->set_parameter($options);
105:         }
106:         return true;
107:     }
108: 
109:     /**
110:      * Sets a new option value. Available options and values:
111:      * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
112:      *         'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]
113:      * [overlong - Unicode does not allow unnecessarily long encodings of chars,
114:      *             to allow this, set this parameter to true, else to false;
115:      *             default is false.]
116:      * [strict - true: strict mode, good for registration purposes - Causes errors
117:      *           on failures; false: loose mode, ideal for "wildlife" applications
118:      *           by silently ignoring errors and returning the original input instead
119:      *
120:      * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
121:      * @param    string    Value to use (if parameter 1 is a string)
122:      * @return   boolean   true on success, false otherwise
123:      * @access   public
124:      */
125:     function set_parameter($option, $value = false)
126:     {
127:         if (!is_array($option)) {
128:             $option = array($option => $value);
129:         }
130:         foreach ($option as $k => $v) {
131:             switch ($k) {
132:             case 'encoding':
133:                 switch ($v) {
134:                 case 'utf8':
135:                 case 'ucs4_string':
136:                 case 'ucs4_array':
137:                     $this->_api_encoding = $v;
138:                     break;
139:                 default:
140:                     $this->_error('Set Parameter: Unknown parameter '.$v.' for option '.$k);
141:                     return false;
142:                 }
143:                 break;
144:             case 'overlong':
145:                 $this->_allow_overlong = ($v) ? true : false;
146:                 break;
147:             case 'strict':
148:                 $this->_strict_mode = ($v) ? true : false;
149:                 break;
150:             default:
151:                 $this->_error('Set Parameter: Unknown option '.$k);
152:                 return false;
153:             }
154:         }
155:         return true;
156:     }
157: 
158:     /**
159:      * Decode a given ACE domain name
160:      * @param    string   Domain name (ACE string)
161:      * [@param    string   Desired output encoding, see {@link set_parameter}]
162:      * @return   string   Decoded Domain name (UTF-8 or UCS-4)
163:      * @access   public
164:      */
165:     function decode($input, $one_time_encoding = false)
166:     {
167:         // Optionally set
168:         if ($one_time_encoding) {
169:             switch ($one_time_encoding) {
170:             case 'utf8':
171:             case 'ucs4_string':
172:             case 'ucs4_array':
173:                 break;
174:             default:
175:                 $this->_error('Unknown encoding '.$one_time_encoding);
176:                 return false;
177:             }
178:         }
179:         // Make sure to drop any newline characters around
180:         $input = trim($input);
181: 
182:         // Negotiate input and try to determine, whether it is a plain string,
183:         // an email address or something like a complete URL
184:         if (strpos($input, '@')) { // Maybe it is an email address
185:             // No no in strict mode
186:             if ($this->_strict_mode) {
187:                 $this->_error('Only simple domain name parts can be handled in strict mode');
188:                 return false;
189:             }
190:             list ($email_pref, $input) = explode('@', $input, 2);
191:             $arr = explode('.', $input);
192:             foreach ($arr as $k => $v) {
193:                 if (preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $v)) {
194:                     $conv = $this->_decode($v);
195:                     if ($conv) $arr[$k] = $conv;
196:                 }
197:             }
198:             $input = join('.', $arr);
199:             $arr = explode('.', $email_pref);
200:             foreach ($arr as $k => $v) {
201:                 if (preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $v)) {
202:                     $conv = $this->_decode($v);
203:                     if ($conv) $arr[$k] = $conv;
204:                 }
205:             }
206:             $email_pref = join('.', $arr);
207:             $return = $email_pref . '@' . $input;
208:         } elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
209:             // No no in strict mode
210:             if ($this->_strict_mode) {
211:                 $this->_error('Only simple domain name parts can be handled in strict mode');
212:                 return false;
213:             }
214:             $parsed = parse_url($input);
215:             if (isset($parsed['host'])) {
216:                 $arr = explode('.', $parsed['host']);
217:                 foreach ($arr as $k => $v) {
218:                     $conv = $this->_decode($v);
219:                     if ($conv) $arr[$k] = $conv;
220:                 }
221:                 $parsed['host'] = join('.', $arr);
222:                 $return =
223:                         (empty($parsed['scheme']) ? '' : $parsed['scheme'].(strtolower($parsed['scheme']) == 'mailto' ? ':' : '://'))
224:                         .(empty($parsed['user']) ? '' : $parsed['user'].(empty($parsed['pass']) ? '' : ':'.$parsed['pass']).'@')
225:                         .$parsed['host']
226:                         .(empty($parsed['port']) ? '' : ':'.$parsed['port'])
227:                         .(empty($parsed['path']) ? '' : $parsed['path'])
228:                         .(empty($parsed['query']) ? '' : '?'.$parsed['query'])
229:                         .(empty($parsed['fragment']) ? '' : '#'.$parsed['fragment']);
230:             } else { // parse_url seems to have failed, try without it
231:                 $arr = explode('.', $input);
232:                 foreach ($arr as $k => $v) {
233:                     $conv = $this->_decode($v);
234:                     $arr[$k] = ($conv) ? $conv : $v;
235:                 }
236:                 $return = join('.', $arr);
237:             }
238:         } else { // Otherwise we consider it being a pure domain name string
239:             $return = $this->_decode($input);
240:             if (!$return) $return = $input;
241:         }
242:         // The output is UTF-8 by default, other output formats need conversion here
243:         // If one time encoding is given, use this, else the objects property
244:         switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) {
245:         case 'utf8':
246:             return $return;
247:             break;
248:         case 'ucs4_string':
249:            return $this->_ucs4_to_ucs4_string($this->_utf8_to_ucs4($return));
250:            break;
251:         case 'ucs4_array':
252:             return $this->_utf8_to_ucs4($return);
253:             break;
254:         default:
255:             $this->_error('Unsupported output format');
256:             return false;
257:         }
258:     }
259: 
260:     /**
261:      * Encode a given UTF-8 domain name
262:      * @param    string   Domain name (UTF-8 or UCS-4)
263:      * [@param    string   Desired input encoding, see {@link set_parameter}]
264:      * @return   string   Encoded Domain name (ACE string)
265:      * @access   public
266:      */
267:     function encode($decoded, $one_time_encoding = false)
268:     {
269:         // Forcing conversion of input to UCS4 array
270:         // If one time encoding is given, use this, else the objects property
271:         switch ($one_time_encoding ? $one_time_encoding : $this->_api_encoding) {
272:         case 'utf8':
273:             $decoded = $this->_utf8_to_ucs4($decoded);
274:             break;
275:         case 'ucs4_string':
276:            $decoded = $this->_ucs4_string_to_ucs4($decoded);
277:         case 'ucs4_array':
278:            break;
279:         default:
280:             $this->_error('Unsupported input format: '.($one_time_encoding ? $one_time_encoding : $this->_api_encoding));
281:             return false;
282:         }
283: 
284:         // No input, no output, what else did you expect?
285:         if (empty($decoded)) return '';
286: 
287:         // Anchors for iteration
288:         $last_begin = 0;
289:         // Output string
290:         $output = '';
291:         foreach ($decoded as $k => $v) {
292:             // Make sure to use just the plain dot
293:             switch($v) {
294:             case 0x3002:
295:             case 0xFF0E:
296:             case 0xFF61:
297:                 $decoded[$k] = 0x2E;
298:                 // Right, no break here, the above are converted to dots anyway
299:             // Stumbling across an anchoring character
300:             case 0x2E:
301:             case 0x2F:
302:             case 0x3A:
303:             case 0x3F:
304:             case 0x40:
305:                 // Neither email addresses nor URLs allowed in strict mode
306:                 if ($this->_strict_mode) {
307:                    $this->_error('Neither email addresses nor URLs are allowed in strict mode.');
308:                    return false;
309:                 } else {
310:                     // Skip first char
311:                     if ($k) {
312:                         $encoded = '';
313:                         $encoded = $this->_encode(array_slice($decoded, $last_begin, (($k)-$last_begin)));
314:                         if ($encoded) {
315:                             $output .= $encoded;
316:                         } else {
317:                             $output .= $this->_ucs4_to_utf8(array_slice($decoded, $last_begin, (($k)-$last_begin)));
318:                         }
319:                         $output .= chr($decoded[$k]);
320:                     }
321:                     $last_begin = $k + 1;
322:                 }
323:             }
324:         }
325:         // Catch the rest of the string
326:         if ($last_begin) {
327:             $inp_len = sizeof($decoded);
328:             $encoded = '';
329:             $encoded = $this->_encode(array_slice($decoded, $last_begin, (($inp_len)-$last_begin)));
330:             if ($encoded) {
331:                 $output .= $encoded;
332:             } else {
333:                 $output .= $this->_ucs4_to_utf8(array_slice($decoded, $last_begin, (($inp_len)-$last_begin)));
334:             }
335:             return $output;
336:         } else {
337:             if ($output = $this->_encode($decoded)) {
338:                 return $output;
339:             } else {
340:                 return $this->_ucs4_to_utf8($decoded);
341:             }
342:         }
343:     }
344: 
345:     /**
346:      * Use this method to get the last error ocurred
347:      * @param    void
348:      * @return   string   The last error, that occured
349:      * @access   public
350:      */
351:     function get_last_error()
352:     {
353:         return $this->_error;
354:     }
355: 
356:     /**
357:      * The actual decoding algorithm
358:      * @access   private
359:      */
360:     function _decode($encoded)
361:     {
362:         // We do need to find the Punycode prefix
363:         if (!preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $encoded)) {
364:             $this->_error('This is not a punycode string');
365:             return false;
366:         }
367:         $encode_test = preg_replace('!^'.preg_quote($this->_punycode_prefix, '!').'!', '', $encoded);
368:         // If nothing left after removing the prefix, it is hopeless
369:         if (!$encode_test) {
370:             $this->_error('The given encoded string was empty');
371:             return false;
372:         }
373:         // Find last occurence of the delimiter
374:         $delim_pos = strrpos($encoded, '-');
375:         if ($delim_pos > strlen($this->_punycode_prefix)) {
376:             for ($k = strlen($this->_punycode_prefix); $k < $delim_pos; ++$k) {
377:                 $decoded[] = ord($encoded{$k});
378:             }
379:         } else {
380:             $decoded = array();
381:         }
382:         $deco_len = count($decoded);
383:         $enco_len = strlen($encoded);
384: 
385:         // Wandering through the strings; init
386:         $is_first = true;
387:         $bias     = $this->_initial_bias;
388:         $idx      = 0;
389:         $char     = $this->_initial_n;
390: 
391:         for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
392:             for ($old_idx = $idx, $w = 1, $k = $this->_base; 1 ; $k += $this->_base) {
393:                 $digit = $this->_decode_digit($encoded{$enco_idx++});
394:                 $idx += $digit * $w;
395:                 $t = ($k <= $bias) ? $this->_tmin :
396:                         (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
397:                 if ($digit < $t) break;
398:                 $w = (int) ($w * ($this->_base - $t));
399:             }
400:             $bias = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
401:             $is_first = false;
402:             $char += (int) ($idx / ($deco_len + 1));
403:             $idx %= ($deco_len + 1);
404:             if ($deco_len > 0) {
405:                 // Make room for the decoded char
406:                 for ($i = $deco_len; $i > $idx; $i--) {
407:                     $decoded[$i] = $decoded[($i - 1)];
408:                 }
409:             }
410:             $decoded[$idx++] = $char;
411:         }
412:         return $this->_ucs4_to_utf8($decoded);
413:     }
414: 
415:     /**
416:      * The actual encoding algorithm
417:      * @access   private
418:      */
419:     function _encode($decoded)
420:     {
421:         // We cannot encode a domain name containing the Punycode prefix
422:         $extract = strlen($this->_punycode_prefix);
423:         $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
424:         $check_deco = array_slice($decoded, 0, $extract);
425: 
426:         if ($check_pref == $check_deco) {
427:             $this->_error('This is already a punycode string');
428:             return false;
429:         }
430:         // We will not try to encode strings consisting of basic code points only
431:         $encodable = false;
432:         foreach ($decoded as $k => $v) {
433:             if ($v > 0x7a) {
434:                 $encodable = true;
435:                 break;
436:             }
437:         }
438:         if (!$encodable) {
439:             $this->_error('The given string does not contain encodable chars');
440:             return false;
441:         }
442: 
443:         // Do NAMEPREP
444:         $decoded = $this->_nameprep($decoded);
445:         if (!$decoded || !is_array($decoded)) return false; // NAMEPREP failed
446: 
447:         $deco_len  = count($decoded);
448:         if (!$deco_len) return false; // Empty array
449: 
450:         $codecount = 0; // How many chars have been consumed
451: 
452:         $encoded = '';
453:         // Copy all basic code points to output
454:         for ($i = 0; $i < $deco_len; ++$i) {
455:             $test = $decoded[$i];
456:             // Will match [-0-9a-zA-Z]
457:             if ((0x2F < $test && $test < 0x40) || (0x40 < $test && $test < 0x5B)
458:                     || (0x60 < $test && $test <= 0x7B) || (0x2D == $test)) {
459:                 $encoded .= chr($decoded[$i]);
460:                 $codecount++;
461:             }
462:         }
463:         if ($codecount == $deco_len) return $encoded; // All codepoints were basic ones
464: 
465:         // Start with the prefix; copy it to output
466:         $encoded = $this->_punycode_prefix.$encoded;
467: 
468:         // If we have basic code points in output, add an hyphen to the end
469:         if ($codecount) $encoded .= '-';
470: 
471:         // Now find and encode all non-basic code points
472:         $is_first  = true;
473:         $cur_code  = $this->_initial_n;
474:         $bias      = $this->_initial_bias;
475:         $delta     = 0;
476:         while ($codecount < $deco_len) {
477:             // Find the smallest code point >= the current code point and
478:             // remember the last ouccrence of it in the input
479:             for ($i = 0, $next_code = $this->_max_ucs; $i < $deco_len; $i++) {
480:                 if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
481:                     $next_code = $decoded[$i];
482:                 }
483:             }
484: 
485:             $delta += ($next_code - $cur_code) * ($codecount + 1);
486:             $cur_code = $next_code;
487: 
488:             // Scan input again and encode all characters whose code point is $cur_code
489:             for ($i = 0; $i < $deco_len; $i++) {
490:                 if ($decoded[$i] < $cur_code) {
491:                     $delta++;
492:                 } elseif ($decoded[$i] == $cur_code) {
493:                     for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
494:                         $t = ($k <= $bias) ? $this->_tmin :
495:                                 (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
496:                         if ($q < $t) break;
497:                         $encoded .= $this->_encode_digit(intval($t + (($q - $t) % ($this->_base - $t)))); //v0.4.5 Changed from ceil() to intval()
498:                         $q = (int) (($q - $t) / ($this->_base - $t));
499:                     }
500:                     $encoded .= $this->_encode_digit($q);
501:                     $bias = $this->_adapt($delta, $codecount+1, $is_first);
502:                     $codecount++;
503:                     $delta = 0;
504:                     $is_first = false;
505:                 }
506:             }
507:             $delta++;
508:             $cur_code++;
509:         }
510:         return $encoded;
511:     }
512: 
513:     /**
514:      * Adapt the bias according to the current code point and position
515:      * @access   private
516:      */
517:     function _adapt($delta, $npoints, $is_first)
518:     {
519:         $delta = intval($is_first ? ($delta / $this->_damp) : ($delta / 2));
520:         $delta += intval($delta / $npoints);
521:         for ($k = 0; $delta > (($this->_base - $this->_tmin) * $this->_tmax) / 2; $k += $this->_base) {
522:             $delta = intval($delta / ($this->_base - $this->_tmin));
523:         }
524:         return intval($k + ($this->_base - $this->_tmin + 1) * $delta / ($delta + $this->_skew));
525:     }
526: 
527:     /**
528:      * Encoding a certain digit
529:      * @access   private
530:      */
531:     function _encode_digit($d)
532:     {
533:         return chr($d + 22 + 75 * ($d < 26));
534:     }
535: 
536:     /**
537:      * Decode a certain digit
538:      * @access   private
539:      */
540:     function _decode_digit($cp)
541:     {
542:         $cp = ord($cp);
543:         return ($cp - 48 < 10) ? $cp - 22 : (($cp - 65 < 26) ? $cp - 65 : (($cp - 97 < 26) ? $cp - 97 : $this->_base));
544:     }
545: 
546:     /**
547:      * Internal error handling method
548:      * @access   private
549:      */
550:     function _error($error = '')
551:     {
552:         $this->_error = $error;
553:     }
554: 
555:     /**
556:      * Do Nameprep according to RFC3491 and RFC3454
557:      * @param    array    Unicode Characters
558:      * @return   string   Unicode Characters, Nameprep'd
559:      * @access   private
560:      */
561:     function _nameprep($input)
562:     {
563:         $output = array();
564:         $error = false;
565:         //
566:         // Mapping
567:         // Walking through the input array, performing the required steps on each of
568:         // the input chars and putting the result into the output array
569:         // While mapping required chars we apply the cannonical ordering
570:         foreach ($input as $v) {
571:             // Map to nothing == skip that code point
572:             if (in_array($v, $this->NP['map_nothing'])) continue;
573: 
574:             // Try to find prohibited input
575:             if (in_array($v, $this->NP['prohibit']) || in_array($v, $this->NP['general_prohibited'])) {
576:                 $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
577:                 return false;
578:             }
579:             foreach ($this->NP['prohibit_ranges'] as $range) {
580:                 if ($range[0] <= $v && $v <= $range[1]) {
581:                     $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
582:                     return false;
583:                 }
584:             }
585:             //
586:             // Hangul syllable decomposition
587:             if (0xAC00 <= $v && $v <= 0xD7AF) {
588:                 foreach ($this->_hangul_decompose($v) as $out) {
589:                     $output[] = (int) $out;
590:                 }
591:             // There's a decomposition mapping for that code point
592:             } elseif (isset($this->NP['replacemaps'][$v])) {
593:                 foreach ($this->_apply_cannonical_ordering($this->NP['replacemaps'][$v]) as $out) {
594:                     $output[] = (int) $out;
595:                 }
596:             } else {
597:                 $output[] = (int) $v;
598:             }
599:         }
600:         // Before applying any Combining, try to rearrange any Hangul syllables
601:         $output = $this->_hangul_compose($output);
602:         //
603:         // Combine code points
604:         //
605:         $last_class   = 0;
606:         $last_starter = 0;
607:         $out_len      = count($output);
608:         for ($i = 0; $i < $out_len; ++$i) {
609:             $class = $this->_get_combining_class($output[$i]);
610:             if ((!$last_class || $last_class > $class) && $class) {
611:                 // Try to match
612:                 $seq_len = $i - $last_starter;
613:                 $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
614:                 // On match: Replace the last starter with the composed character and remove
615:                 // the now redundant non-starter(s)
616:                 if ($out) {
617:                     $output[$last_starter] = $out;
618:                     if (count($out) != $seq_len) {
619:                         for ($j = $i+1; $j < $out_len; ++$j) {
620:                             $output[$j-1] = $output[$j];
621:                         }
622:                         unset($output[$out_len]);
623:                     }
624:                     // Rewind the for loop by one, since there can be more possible compositions
625:                     $i--;
626:                     $out_len--;
627:                     $last_class = ($i == $last_starter) ? 0 : $this->_get_combining_class($output[$i-1]);
628:                     continue;
629:                 }
630:             }
631:             // The current class is 0
632:             if (!$class) $last_starter = $i;
633:             $last_class = $class;
634:         }
635:         return $output;
636:     }
637: 
638:     /**
639:      * Decomposes a Hangul syllable
640:      * (see http://www.unicode.org/unicode/reports/tr15/#Hangul
641:      * @param    integer  32bit UCS4 code point
642:      * @return   array    Either Hangul Syllable decomposed or original 32bit value as one value array
643:      * @access   private
644:      */
645:     function _hangul_decompose($char)
646:     {
647:         $sindex = (int) $char - $this->_sbase;
648:         if ($sindex < 0 || $sindex >= $this->_scount) {
649:             return array($char);
650:         }
651:         $result = array();
652:         $result[] = (int) $this->_lbase + $sindex / $this->_ncount;
653:         $result[] = (int) $this->_vbase + ($sindex % $this->_ncount) / $this->_tcount;
654:         $T = intval($this->_tbase + $sindex % $this->_tcount);
655:         if ($T != $this->_tbase) $result[] = $T;
656:         return $result;
657:     }
658:     /**
659:      * Ccomposes a Hangul syllable
660:      * (see http://www.unicode.org/unicode/reports/tr15/#Hangul
661:      * @param    array    Decomposed UCS4 sequence
662:      * @return   array    UCS4 sequence with syllables composed
663:      * @access   private
664:      */
665:     function _hangul_compose($input)
666:     {
667:         $inp_len = count($input);
668:         if (!$inp_len) return array();
669:         $result = array();
670:         $last = (int) $input[0];
671:         $result[] = $last; // copy first char from input to output
672: 
673:         for ($i = 1; $i < $inp_len; ++$i) {
674:             $char = (int) $input[$i];
675:             $sindex = $last - $this->_sbase;
676:             $lindex = $last - $this->_lbase;
677:             $vindex = $char - $this->_vbase;
678:             $tindex = $char - $this->_tbase;
679:             // Find out, whether two current characters are LV and T
680:             if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount == 0)
681:                     && 0 <= $tindex && $tindex <= $this->_tcount) {
682:                 // create syllable of form LVT
683:                 $last += $tindex;
684:                 $result[(count($result) - 1)] = $last; // reset last
685:                 continue; // discard char
686:             }
687:             // Find out, whether two current characters form L and V
688:             if (0 <= $lindex && $lindex < $this->_lcount && 0 <= $vindex && $vindex < $this->_vcount) {
689:                 // create syllable of form LV
690:                 $last = (int) $this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount;
691:                 $result[(count($result) - 1)] = $last; // reset last
692:                 continue; // discard char
693:             }
694:             // if neither case was true, just add the character
695:             $last = $char;
696:             $result[] = $char;
697:         }
698:         return $result;
699:     }
700: 
701:     /**
702:      * Returns the combining class of a certain wide char
703:      * @param    integer    Wide char to check (32bit integer)
704:      * @return   integer    Combining class if found, else 0
705:      * @access   private
706:      */
707:     function _get_combining_class($char)
708:     {
709:         return isset($this->NP['norm_combcls'][$char]) ? $this->NP['norm_combcls'][$char] : 0;
710:     }
711: 
712:     /**
713:      * Apllies the cannonical ordering of a decomposed UCS4 sequence
714:      * @param    array      Decomposed UCS4 sequence
715:      * @return   array      Ordered USC4 sequence
716:      * @access   private
717:      */
718:     function _apply_cannonical_ordering($input)
719:     {
720:         $swap = true;
721:         $size = count($input);
722:         while ($swap) {
723:             $swap = false;
724:             $last = $this->_get_combining_class(intval($input[0]));
725:             for ($i = 0; $i < $size-1; ++$i) {
726:                 $next = $this->_get_combining_class(intval($input[$i+1]));
727:                 if ($next != 0 && $last > $next) {
728:                     // Move item leftward until it fits
729:                     for ($j = $i + 1; $j > 0; --$j) {
730:                         if ($this->_get_combining_class(intval($input[$j-1])) <= $next) break;
731:                         $t = intval($input[$j]);
732:                         $input[$j] = intval($input[$j-1]);
733:                         $input[$j-1] = $t;
734:                         $swap = true;
735:                     }
736:                     // Reentering the loop looking at the old character again
737:                     $next = $last;
738:                 }
739:                 $last = $next;
740:             }
741:         }
742:         return $input;
743:     }
744: 
745:     /**
746:      * Do composition of a sequence of starter and non-starter
747:      * @param    array      UCS4 Decomposed sequence
748:      * @return   array      Ordered USC4 sequence
749:      * @access   private
750:      */
751:     function _combine($input)
752:     {
753:         $inp_len = count($input);
754:         foreach ($this->NP['replacemaps'] as $np_src => $np_target) {
755:             if ($np_target[0] != $input[0]) continue;
756:             if (count($np_target) != $inp_len) continue;
757:             $hit = false;
758:             foreach ($input as $k2 => $v2) {
759:                 if ($v2 == $np_target[$k2]) {
760:                     $hit = true;
761:                 } else {
762:                     $hit = false;
763:                     break;
764:                 }
765:             }
766:             if ($hit) return $np_src;
767:         }
768:         return false;
769:     }
770: 
771:     /**
772:      * This converts an UTF-8 encoded string to its UCS-4 representation
773:      * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
774:      * each of the "chars". This is due to PHP not being able to handle strings with
775:      * bit depth different from 8. This apllies to the reverse method _ucs4_to_utf8(), too.
776:      * The following UTF-8 encodings are supported:
777:      * bytes bits  representation
778:      * 1        7  0xxxxxxx
779:      * 2       11  110xxxxx 10xxxxxx
780:      * 3       16  1110xxxx 10xxxxxx 10xxxxxx
781:      * 4       21  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
782:      * 5       26  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
783:      * 6       31  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
784:      * Each x represents a bit that can be used to store character data.
785:      * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
786:      * @access   private
787:      */
788:     function _utf8_to_ucs4($input)
789:     {
790:         $output = array();
791:         $out_len = 0;
792:         $inp_len = strlen($input);
793:         $mode = 'next';
794:         $test = 'none';
795:         for ($k = 0; $k < $inp_len; ++$k) {
796:             $v = ord($input{$k}); // Extract byte from input string
797: 
798:             if ($v < 128) { // We found an ASCII char - put into stirng as is
799:                 $output[$out_len] = $v;
800:                 ++$out_len;
801:                 if ('add' == $mode) {
802:                     $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
803:                     return false;
804:                 }
805:                 continue;
806:             }
807:             if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
808:                 $start_byte = $v;
809:                 $mode = 'add';
810:                 $test = 'range';
811:                 if ($v >> 5 == 6) { // &110xxxxx 10xxxxx
812:                     $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
813:                     $v = ($v - 192) << 6;
814:                 } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
815:                     $next_byte = 1;
816:                     $v = ($v - 224) << 12;
817:                 } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
818:                     $next_byte = 2;
819:                     $v = ($v - 240) << 18;
820:                 } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
821:                     $next_byte = 3;
822:                     $v = ($v - 248) << 24;
823:                 } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
824:                     $next_byte = 4;
825:                     $v = ($v - 252) << 30;
826:                 } else {
827:                     $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k);
828:                     return false;
829:                 }
830:                 if ('add' == $mode) {
831:                     $output[$out_len] = (int) $v;
832:                     ++$out_len;
833:                     continue;
834:                 }
835:             }
836:             if ('add' == $mode) {
837:                 if (!$this->_allow_overlong && $test == 'range') {
838:                     $test = 'none';
839:                     if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
840:                         $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
841:                         return false;
842:                     }
843:                 }
844:                 if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
845:                     $v = ($v - 128) << ($next_byte * 6);
846:                     $output[($out_len - 1)] += $v;
847:                     --$next_byte;
848:                 } else {
849:                     $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
850:                     return false;
851:                 }
852:                 if ($next_byte < 0) {
853:                     $mode = 'next';
854:                 }
855:             }
856:         } // for
857:         return $output;
858:     }
859: 
860:     /**
861:      * Convert UCS-4 string into UTF-8 string
862:      * See _utf8_to_ucs4() for details
863:      * @access   private
864:      */
865:     function _ucs4_to_utf8($input)
866:     {
867:         $output = '';
868:         $k = 0;
869:         foreach ($input as $v) {
870:             ++$k;
871:             // $v = ord($v);
872:             if ($v < 128) { // 7bit are transferred literally
873:                 $output .= chr($v);
874:             } elseif ($v < (1 << 11)) { // 2 bytes
875:                 $output .= chr(192 + ($v >> 6)) . chr(128 + ($v & 63));
876:             } elseif ($v < (1 << 16)) { // 3 bytes
877:                 $output .= chr(224 + ($v >> 12)) . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
878:             } elseif ($v < (1 << 21)) { // 4 bytes
879:                 $output .= chr(240 + ($v >> 18)) . chr(128 + (($v >> 12) & 63))
880:                          . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
881:             } elseif ($v < (1 << 26)) { // 5 bytes
882:                 $output .= chr(248 + ($v >> 24)) . chr(128 + (($v >> 18) & 63))
883:                          . chr(128 + (($v >> 12) & 63)) . chr(128 + (($v >> 6) & 63))
884:                          . chr(128 + ($v & 63));
885:             } elseif ($v < (1 << 31)) { // 6 bytes
886:                 $output .= chr(252 + ($v >> 30)) . chr(128 + (($v >> 24) & 63))
887:                          . chr(128 + (($v >> 18) & 63)) . chr(128 + (($v >> 12) & 63))
888:                          . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
889:             } else {
890:                 $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$k);
891:                 return false;
892:             }
893:         }
894:         return $output;
895:     }
896: 
897:     /**
898:       * Convert UCS-4 array into UCS-4 string
899:       *
900:       * @access   private
901:       */
902:     function _ucs4_to_ucs4_string($input)
903:     {
904:         $output = '';
905:         // Take array values and split output to 4 bytes per value
906:         // The bit mask is 255, which reads &11111111
907:         foreach ($input as $v) {
908:             $output .= chr(($v >> 24) & 255).chr(($v >> 16) & 255).chr(($v >> 8) & 255).chr($v & 255);
909:         }
910:         return $output;
911:     }
912: 
913:     /**
914:       * Convert UCS-4 strin into UCS-4 garray
915:       *
916:       * @access   private
917:       */
918:     function _ucs4_string_to_ucs4($input)
919:     {
920:         $output = array();
921:         $inp_len = strlen($input);
922:         // Input length must be dividable by 4
923:         if ($inp_len % 4) {
924:             $this->_error('Input UCS4 string is broken');
925:             return false;
926:         }
927:         // Empty input - return empty output
928:         if (!$inp_len) return $output;
929:         for ($i = 0, $out_len = -1; $i < $inp_len; ++$i) {
930:             // Increment output position every 4 input bytes
931:             if (!($i % 4)) {
932:                 $out_len++;
933:                 $output[$out_len] = 0;
934:             }
935:             $output[$out_len] += ord($input{$i}) << (8 * (3 - ($i % 4) ) );
936:         }
937:         return $output;
938:     }
939: }
940: 
/**
* Adapter class for aligning the API of idna_convert with that of Net_IDNA
*
* It adds a single method, setParams(), on top of what it inherits from
* idna_convert.
*
* @author  Matthias Sommerfeld <mso@phlylabs.de>
*/
class Net_IDNA_php4 extends idna_convert
{
    /**
     * Sets a new option value. Available options and values:
     * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
     *         'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]
     * [overlong - Unicode does not allow unnecessarily long encodings of chars,
     *             to allow this, set this parameter to true, else to false;
     *             default is false.]
     * [strict - true: strict mode, good for registration purposes - Causes errors
     *           on failures; false: loose mode, ideal for "wildlife" applications
     *           by silently ignoring errors and returning the original input instead]
     *
     * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param    string    Value to use (if parameter 1 is a string)
     * @return   boolean   true on success, false otherwise
     * @access   public
     */
    function setParams($option, $param = false)
    {
        // Passes both arguments on unchanged to set_parameters() of the object
        // stored in $this->IC and hands back whatever that call returns.
        // NOTE(review): neither the IC property nor a set_parameters() method is
        // defined in the visible part of this file - since this class already
        // extends idna_convert, confirm that $this->IC is really populated and
        // that the target method exists under this name.
        return $this->IC->set_parameters($option, $param);
    }
}
968: 
969: ?>
Packages

Classes

Functions