2013年12月28日土曜日

Count up letters, bytes in UTF-8 and surrogate pairs in JavaScript

Below is sample JavaScript code that counts the letters, UTF-8 bytes, and surrogate pairs in a string. It is also available as a gist.



0 letter(s).
0 byte(s) in UTF-8.
0 surrogate pair(s).


<!DOCTYPE html>
<html>
<!--
* Copyright (C) 2013 Neo Visionaries Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
-->
<head>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8"/>
<meta name="author" content="Takahiko Kawasaki">
<title>Count up letters, bytes in UTF-8 and surrogate pairs in JavaScript</title>
<script type="text/javascript">
// Returns the number of UTF-8 bytes contributed by one UTF-16 code unit.
//
// The value is compared against ascending inclusive upper bounds and the
// first bound that is not exceeded decides the result:
//
//   up to 0x007F     -> 1
//   0x0080 - 0x07FF  -> 2
//   0x0800 - 0xD7FF  -> 3
//   0xD800 - 0xDFFF  -> 2  (one half of a surrogate pair)
//   0xE000 - 0xFFFF  -> 3
//   above 0xFFFF     -> 0  (also the result when no comparison holds,
//                           e.g. for NaN)
//
// A surrogate pair stands for one character in U+10000 - U+10FFFF, which
// takes 4 bytes in UTF-8. Each half of the pair is therefore reported as
// 2 bytes so that summing over both code units yields 4.
function compute_bytes_in_utf8(codePoint)
{
    // Rows are [inclusive upper bound, byte count], in ascending order.
    var table = [
        [0x007F, 1],
        [0x07FF, 2],
        [0xD7FF, 3],
        [0xDFFF, 2],
        [0xFFFF, 3]
    ];

    for (var row = 0; row < table.length; ++row)
    {
        if (codePoint <= table[row][0])
        {
            return table[row][1];
        }
    }

    // Larger than any single UTF-16 code unit can be.
    return 0;
}
// Recounts letters, UTF-8 bytes and surrogate pairs for the text currently
// held by the "input" element and writes the three totals into the
// "outputLetters", "outputBytes" and "outputPairs" elements.
//
// Counting rules:
//   - A code unit outside 0xD800 - 0xDFFF is one letter; its byte count
//     comes from compute_bytes_in_utf8().
//   - A high surrogate immediately followed by a low surrogate is one
//     letter, one surrogate pair and 4 bytes.
//   - An unpaired surrogate (high or low) is one letter and 3 bytes, and is
//     NOT counted as a pair. 3 bytes is the size of U+FFFD, which is what
//     UTF-8 encoders such as TextEncoder emit in its place.
function count_up()
{
    // HTML elements for input and output.
    var input = document.getElementById("input").value;
    var outputLetters = document.getElementById("outputLetters");
    var outputBytes = document.getElementById("outputBytes");
    var outputPairs = document.getElementById("outputPairs");

    // Counters for letters, bytes in UTF-8 and surrogate pairs.
    var nLetters = 0;
    var nBytes = 0;
    var nPairs = 0;

    // Walk the string one UTF-16 code unit at a time. charCodeAt() always
    // returns a value below 65,536; U+10000 and above show up as two
    // consecutive code units (a surrogate pair).
    for (var i = 0; i < input.length; ++i)
    {
        var codeUnit = input.charCodeAt(i);

        // Each iteration starts a new letter: the second half of a valid
        // pair is skipped below, so it never reaches this line.
        ++nLetters;

        // Not a surrogate: the code unit is the code point itself.
        if (codeUnit < 0xD800 || 0xDFFF < codeUnit)
        {
            nBytes += compute_bytes_in_utf8(codeUnit);
            continue;
        }

        // Surrogate: it only forms a pair when it is a high surrogate and
        // the very next code unit is a low surrogate.
        var isHigh = (codeUnit <= 0xDBFF);
        var next = (i + 1 < input.length) ? input.charCodeAt(i + 1) : -1;
        var nextIsLow = (0xDC00 <= next && next <= 0xDFFF);

        if (isHigh && nextIsLow)
        {
            ++nPairs;
            nBytes += 4;

            // Consume the low surrogate together with its high surrogate.
            ++i;
        }
        else
        {
            // Unpaired surrogate: encoded as U+FFFD, which is 3 bytes.
            nBytes += 3;
        }
    }

    // Write results. textContent is used because the values are plain
    // numbers and never need to be parsed as markup.
    outputLetters.textContent = nLetters;
    outputBytes.textContent = nBytes;
    outputPairs.textContent = nPairs;
}
</script>
<body>
<!-- Input -->
<input id="input" type="text" onInput="count_up()"><br/>
<!-- Output: Number of letters-->
<span id="outputLetters">0</span> letter(s).<br/>
<!-- Output: Number of bytes in UTF-8 -->
<span id="outputBytes">0</span> byte(s) in UTF-8.<br/>
<!-- Output: Number of surrogate pairs -->
<span id="outputPairs">0</span> surrogate pair(s).<br/>
</body>
</html>
view raw gistfile1.html hosted with ❤ by GitHub