punycode.es6.js 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444
  'use strict';

  /** Largest value representable as a signed 32-bit integer (2^31 - 1). */
  const maxInt = 0x7FFFFFFF; // 2147483647

  /** Bootstring parameters (the values Punycode uses; see RFC 3492). */
  const base = 36;
  const tMin = 1;
  const tMax = 26;
  const skew = 38;
  const damp = 700;
  const initialBias = 72;
  const initialN = 0x80; // 128
  const delimiter = '-'; // '\x2D'

  /** Regular expressions */
  // Matches the literal, lowercase `xn--` prefix at the start of a string.
  const regexPunycode = /^xn--/;
  // Matches any character outside U+0000..U+007F. U+007F DEL is inside that
  // range, so DEL on its own does not count as non-ASCII here.
  const regexNonASCII = /[^\0-\x7F]/;
  // RFC 3490 label separators; global so `replace` rewrites every occurrence.
  const regexSeparators = /[\x2E\u3002\uFF0E\uFF61]/g;

  /** Error messages, keyed by the type string handed to `error()`. */
  const errors = {
    'overflow': 'Overflow: input needs wider integers to process',
    'not-basic': 'Illegal input >= 0x80 (not a basic code point)',
    'invalid-input': 'Invalid input'
  };

  /** Convenience shortcuts */
  const floor = Math.floor;
  const stringFromCharCode = String.fromCharCode;
  const baseMinusTMin = base - tMin;
  27. /*--------------------------------------------------------------------------*/
  /**
   * Raises the error registered under `type` in the `errors` table.
   * This function never returns normally.
   * @private
   * @param {String} type The error type (a key of `errors`).
   * @throws {RangeError} Always; the message is `errors[type]`.
   */
  function error(type) {
    const message = errors[type];
    throw new RangeError(message);
  }
  /**
   * Minimal stand-in for `Array#map`.
   * The callback receives only the item (no index or array argument) and is
   * invoked from the last element down to the first.
   * @private
   * @param {Array} array The array to iterate over.
   * @param {Function} callback Invoked once per array item.
   * @returns {Array} A new array holding the callback's return values, at the
   * same indices as the corresponding inputs.
   */
  function map(array, callback) {
    const mapped = [];
    // Walk backwards so the call order matches a `while (length--)` loop.
    for (let index = array.length - 1; index >= 0; index--) {
      mapped[index] = callback(array[index]);
    }
    return mapped;
  }
  /**
   * A simple `Array#map`-like wrapper to work with domain name strings or email
   * addresses. The domain is split into labels, `callback` is applied to each
   * label, and the results are re-joined with `.`.
   *
   * For email addresses only the domain is processed. The local part is
   * everything up to and including the LAST `@`, so a local part that itself
   * contains `@` (e.g. a quoted local part) is kept intact instead of being
   * truncated.
   * @private
   * @param {String} domain The domain name or email address.
   * @param {Function} callback The function that gets called for every
   * label of the domain.
   * @returns {String} The local part (if any) followed by the mapped labels
   * joined with `.`.
   */
  function mapDomain(domain, callback) {
    // Split at the last `@`: everything before it is the local part and must
    // be passed through untouched.
    const atIndex = domain.lastIndexOf('@');
    const localPart = atIndex === -1 ? '' : domain.slice(0, atIndex + 1);
    const host = atIndex === -1 ? domain : domain.slice(atIndex + 1);

    // Normalize every RFC 3490 separator to `.` first.
    // Avoid `split(regex)` for IE8 compatibility. See #17.
    const labels = host.replace(regexSeparators, '\x2E').split('.');
    const encoded = map(labels, callback).join('.');
    return localPart + encoded;
  }
  /**
   * Turns a JavaScript string into an array of numeric Unicode code points.
   * A high surrogate immediately followed by a low surrogate is combined into
   * one supplementary code point; any other code unit, including an unpaired
   * surrogate, is emitted unchanged.
   * @see `punycode.ucs2.encode`
   * @see <https://mathiasbynens.be/notes/javascript-encoding>
   * @memberOf punycode.ucs2
   * @name decode
   * @param {String} string The Unicode input string (UCS-2).
   * @returns {Array} The new array of code points.
   */
  function ucs2decode(string) {
    const codePoints = [];
    const length = string.length;
    let index = 0;
    while (index < length) {
      const unit = string.charCodeAt(index);
      index += 1;
      const isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;
      if (!isHighSurrogate || index >= length) {
        // Ordinary code unit, or a high surrogate with nothing after it.
        codePoints.push(unit);
        continue;
      }
      const next = string.charCodeAt(index);
      if ((next & 0xFC00) === 0xDC00) {
        // Valid pair: consume the low surrogate and combine the two halves.
        codePoints.push(((unit & 0x3FF) << 10) + (next & 0x3FF) + 0x10000);
        index += 1;
      } else {
        // Unmatched high surrogate: emit it alone and leave `next` to be
        // examined on the following iteration (it may start a valid pair).
        codePoints.push(unit);
      }
    }
    return codePoints;
  }
  /**
   * Creates a string based on an array of numeric code points.
   *
   * The code points are converted in fixed-size chunks rather than by
   * spreading the whole collection into a single `String.fromCodePoint` call,
   * which would exceed the engine's argument/stack limit for large arrays.
   * @see `punycode.ucs2.decode`
   * @memberOf punycode.ucs2
   * @name encode
   * @param {Array} codePoints The array (or other iterable) of numeric code
   * points.
   * @returns {String} The new Unicode string (UCS-2).
   * @throws {RangeError} If any value is not a valid Unicode code point.
   */
  const ucs2encode = (codePoints) => {
    // Any iterable is accepted; normalize to an array so it can be sliced.
    const points = Array.isArray(codePoints) ? codePoints : Array.from(codePoints);
    // Small enough to stay well below argument-count limits.
    const chunkSize = 0x4000;
    let result = '';
    for (let start = 0; start < points.length; start += chunkSize) {
      result += String.fromCodePoint(...points.slice(start, start + chunkSize));
    }
    return result;
  };
  /**
   * Maps a basic code point to the digit value it stands for.
   * `0`-`9` yield 26..35; `A`-`Z` and `a`-`z` both yield 0..25.
   * @see `digitToBasic()`
   * @private
   * @param {Number} codePoint The basic numeric code point value.
   * @returns {Number} The numeric value of a basic code point (for use in
   * representing integers) in the range `0` to `base - 1`, or `base` if
   * the code point does not represent a value.
   */
  const basicToDigit = function(codePoint) {
    const within = (first, endExclusive) => codePoint >= first && codePoint < endExclusive;
    if (within(0x30, 0x3A)) {
      // '0'..'9'
      return codePoint - 0x30 + 26;
    }
    if (within(0x41, 0x5B)) {
      // 'A'..'Z'
      return codePoint - 0x41;
    }
    if (within(0x61, 0x7B)) {
      // 'a'..'z'
      return codePoint - 0x61;
    }
    // Not a digit: signal with the out-of-range value `base`.
    return base;
  };
  /**
   * Maps a digit value to the basic code point that represents it.
   * @see `basicToDigit()`
   * @private
   * @param {Number} digit The numeric value of a basic code point.
   * @param {*} flag Selects the uppercase form unless it is loosely equal
   * to `0`.
   * @returns {Number} The basic code point whose value (when used for
   * representing integers) is `digit`, which needs to be in the range
   * `0` to `base - 1`. If `flag` is non-zero, the uppercase form is
   * used; else, the lowercase form is used. The behavior is undefined
   * if `flag` is non-zero and `digit` has no uppercase form.
   */
  const digitToBasic = function(digit, flag) {
    // 0..25 land on ASCII 'a'..'z' (97..122); 26..35 land on '0'..'9' (48..57).
    const offset = digit < 26 ? 97 : 22;
    // Loose comparison is deliberate: `0`, `false`, `''` and `'0'` all select
    // lowercase, while an omitted flag (`undefined`) selects uppercase.
    // Uppercase letters sit 32 below their lowercase counterparts.
    const caseShift = flag != 0 ? 32 : 0;
    return digit + offset - caseShift;
  };
  /**
   * Bias adaptation function as per section 3.4 of RFC 3492.
   * https://tools.ietf.org/html/rfc3492#section-3.4
   * @private
   * @param {Number} delta The delta that was just encoded or decoded.
   * @param {Number} numPoints Divisor applied to the scaled delta; callers in
   * this file pass the number of code points handled so far plus one.
   * @param {Boolean} firstTime Truthy to scale `delta` by `damp`, falsy to
   * halve it instead.
   * @returns {Number} The new bias.
   */
  const adapt = function(delta, numPoints, firstTime) {
    let scaled = firstTime ? floor(delta / damp) : delta >> 1;
    scaled += floor(scaled / numPoints);

    // Repeatedly divide down until the value fits under the threshold,
    // counting `base` for every division performed.
    const threshold = (baseMinusTMin * tMax) >> 1;
    let k = 0;
    while (scaled > threshold) {
      scaled = floor(scaled / baseMinusTMin);
      k += base;
    }
    return floor(k + ((baseMinusTMin + 1) * scaled) / (scaled + skew));
  };
  /**
   * Converts a Punycode string of ASCII-only symbols to a string of Unicode
   * symbols.
   *
   * The final string is assembled in fixed-size chunks so that very long
   * outputs do not exceed the engine's argument/stack limit, which a single
   * `String.fromCodePoint(...output)` call would.
   * @memberOf punycode
   * @param {String} input The Punycode string of ASCII-only symbols.
   * @returns {String} The resulting string of Unicode symbols.
   * @throws {RangeError} If a code unit >= 0x80 appears before the last
   * delimiter, if a digit is invalid or the input ends mid-integer, or if an
   * intermediate value would exceed `maxInt`.
   */
  const decode = function(input) {
    // Collect code points here (don't use UCS-2 code units).
    const output = [];
    const inputLength = input.length;
    let i = 0;
    let n = initialN;
    let bias = initialBias;

    // Handle the basic code points: let `basic` be the number of input code
    // points before the last delimiter, or `0` if there is none, then copy
    // the first basic code points to the output.
    const lastDelimiter = input.lastIndexOf(delimiter);
    const basic = lastDelimiter < 0 ? 0 : lastDelimiter;
    for (let j = 0; j < basic; ++j) {
      const codeUnit = input.charCodeAt(j);
      if (codeUnit >= 0x80) {
        // Not a basic code point.
        error('not-basic');
      }
      output.push(codeUnit);
    }

    // Main decoding loop: start just after the last delimiter if any basic
    // code points were copied; start at the beginning otherwise.
    // `index` is the index of the next character to be consumed.
    let index = basic > 0 ? basic + 1 : 0;
    while (index < inputLength) {
      // Decode a generalized variable-length integer into `delta`,
      // which gets added to `i`. The overflow checking is easier
      // if we increase `i` as we go, then subtract off its starting
      // value at the end to obtain `delta`.
      const oldi = i;
      let w = 1;
      for (let k = base; /* no condition */; k += base) {
        if (index >= inputLength) {
          // Input ended in the middle of an integer.
          error('invalid-input');
        }
        const digit = basicToDigit(input.charCodeAt(index++));
        if (digit >= base) {
          error('invalid-input');
        }
        if (digit > floor((maxInt - i) / w)) {
          error('overflow');
        }
        i += digit * w;
        const t = k <= bias ? tMin : (k >= bias + tMax ? tMax : k - bias);
        if (digit < t) {
          // This was the integer's final digit.
          break;
        }
        const baseMinusT = base - t;
        if (w > floor(maxInt / baseMinusT)) {
          error('overflow');
        }
        w *= baseMinusT;
      }

      const out = output.length + 1;
      bias = adapt(i - oldi, out, oldi === 0);

      // `i` was supposed to wrap around from `out` to `0`,
      // incrementing `n` each time, so we'll fix that now:
      const wraps = floor(i / out);
      if (wraps > maxInt - n) {
        error('overflow');
      }
      n += wraps;
      i %= out;

      // Insert `n` at position `i` of the output.
      output.splice(i++, 0, n);
    }

    // Convert in chunks to stay well below argument-count limits.
    const chunkSize = 0x4000;
    let result = '';
    for (let start = 0; start < output.length; start += chunkSize) {
      result += String.fromCodePoint(...output.slice(start, start + chunkSize));
    }
    return result;
  };
  /**
   * Converts a string of Unicode symbols (e.g. a domain name label) to a
   * Punycode string of ASCII-only symbols.
   * @memberOf punycode
   * @param {String} input The string of Unicode symbols.
   * @returns {String} The resulting Punycode string of ASCII-only symbols.
   * @throws {RangeError} If a delta would exceed `maxInt`.
   */
  const encode = function(input) {
    // Work on whole code points rather than UTF-16 code units.
    const codePoints = ucs2decode(input);
    const inputLength = codePoints.length;

    // Basic code points are copied to the output verbatim, in input order.
    const output = codePoints
      .filter((codePoint) => codePoint < 0x80)
      .map((codePoint) => stringFromCharCode(codePoint));

    // `basicLength` is the number of basic code points;
    // `handledCPCount` is the number of code points handled so far.
    const basicLength = output.length;
    let handledCPCount = basicLength;

    // Finish the basic string with a delimiter unless it's empty.
    if (basicLength) {
      output.push(delimiter);
    }

    // Encoder state.
    let n = initialN;
    let delta = 0;
    let bias = initialBias;

    // Main encoding loop: one pass per distinct non-basic code point value.
    while (handledCPCount < inputLength) {
      // All non-basic code points < n have been handled already. Find the
      // smallest remaining one.
      const m = codePoints.reduce(
        (smallest, codePoint) => (codePoint >= n && codePoint < smallest ? codePoint : smallest),
        maxInt
      );

      // Increase `delta` enough to advance the decoder's <n,i> state to
      // <m,0>, but guard against overflow. Note this value is computed once
      // per pass and reused for every occurrence of `m` below.
      const handledCPCountPlusOne = handledCPCount + 1;
      if (m - n > floor((maxInt - delta) / handledCPCountPlusOne)) {
        error('overflow');
      }
      delta += (m - n) * handledCPCountPlusOne;
      n = m;

      for (const codePoint of codePoints) {
        if (codePoint < n && ++delta > maxInt) {
          error('overflow');
        }
        if (codePoint !== n) {
          continue;
        }

        // Represent delta as a generalized variable-length integer.
        let q = delta;
        let k = base;
        for (;;) {
          let t;
          if (k <= bias) {
            t = tMin;
          } else if (k >= bias + tMax) {
            t = tMax;
          } else {
            t = k - bias;
          }
          if (q < t) {
            break;
          }
          const qMinusT = q - t;
          const baseMinusT = base - t;
          output.push(stringFromCharCode(digitToBasic(t + qMinusT % baseMinusT, 0)));
          q = floor(qMinusT / baseMinusT);
          k += base;
        }
        // The final (most significant) digit.
        output.push(stringFromCharCode(digitToBasic(q, 0)));

        bias = adapt(delta, handledCPCountPlusOne, handledCPCount === basicLength);
        delta = 0;
        ++handledCPCount;
      }

      ++delta;
      ++n;
    }
    return output.join('');
  };
  /**
   * Converts a Punycode string representing a domain name or an email address
   * to Unicode. Labels that start with `xn--` are decoded (after stripping
   * that prefix and lowercasing the remainder); every other label is returned
   * unchanged, so input that is already Unicode passes through as is.
   * @memberOf punycode
   * @param {String} input The Punycoded domain name or email address to
   * convert to Unicode.
   * @returns {String} The Unicode representation of the given Punycode
   * string.
   */
  const toUnicode = function(input) {
    const decodeLabel = (label) => {
      if (!regexPunycode.test(label)) {
        return label;
      }
      // Drop the 4-character `xn--` prefix before decoding.
      return decode(label.slice(4).toLowerCase());
    };
    return mapDomain(input, decodeLabel);
  };
  /**
   * Converts a Unicode string representing a domain name or an email address to
   * Punycode. Only labels containing a non-ASCII character are encoded and
   * prefixed with `xn--`; all-ASCII labels are returned unchanged, so it
   * doesn't matter if you call it with a domain that's already in ASCII.
   * @memberOf punycode
   * @param {String} input The domain name or email address to convert, as a
   * Unicode string.
   * @returns {String} The Punycode representation of the given domain name or
   * email address.
   */
  const toASCII = function(input) {
    const encodeLabel = (label) => {
      if (!regexNonASCII.test(label)) {
        return label;
      }
      return 'xn--' + encode(label);
    };
    return mapDomain(input, encodeLabel);
  };
  362. /*--------------------------------------------------------------------------*/
  363. /** Define the public API */
  364. const punycode = {
  365. /**
  366. * A string representing the current Punycode.js version number.
  367. * @memberOf punycode
  368. * @type String
  369. */
  370. 'version': '2.1.0',
  371. /**
  372. * An object of methods to convert from JavaScript's internal character
  373. * representation (UCS-2) to Unicode code points, and back.
  374. * @see <https://mathiasbynens.be/notes/javascript-encoding>
  375. * @memberOf punycode
  376. * @type Object
  377. */
  378. 'ucs2': {
  379. 'decode': ucs2decode,
  380. 'encode': ucs2encode
  381. },
  382. 'decode': decode,
  383. 'encode': encode,
  384. 'toASCII': toASCII,
  385. 'toUnicode': toUnicode
  386. };
  387. export { ucs2decode, ucs2encode, decode, encode, toASCII, toUnicode };
  388. export default punycode;