Difference between revisions of "Module:Unicode data"

From annadreambrush.com/wiki
Jump to navigation Jump to search
imported>Erutuon
(fix Hangul Syllable function)
m (1 revision imported)
 
(53 intermediate revisions by 5 users not shown)
Line 1: Line 1:
local export = {}
+
local p = {}
  
 
local floor = math.floor
 
local floor = math.floor
  
-- The following leads, vowels, and trails come from here:
+
local function errorf(level, ...)
-- http://www.unicode.org/Public/UNIDATA/Jamo.txt
+
if type(level) == "number" then
 +
return error(string.format(...), level + 1)
 +
else -- level is actually the format string.
 +
return error(string.format(level, ...), 2)
 +
end
 +
end
 +
 
 +
local function binary_range_search(codepoint, ranges)
 +
local low, mid, high
 +
low, high = 1, ranges.length or require "Module:TableTools".length(ranges)
 +
while low <= high do
 +
mid = floor((low + high) / 2)
 +
local range = ranges[mid]
 +
if codepoint < range[1] then
 +
high = mid - 1
 +
elseif codepoint <= range[2] then
 +
return range, mid
 +
else
 +
low = mid + 1
 +
end
 +
end
 +
return nil, mid
 +
end
 +
p.binary_range_search = binary_range_search
 +
 
 +
--[[
 +
local function linear_range_search(codepoint, ranges)
 +
for i, range in ipairs(ranges) do
 +
if range[1] <= codepoint and codepoint <= range[2] then
 +
return range
 +
end
 +
end
 +
end
 +
--]]
 +
 
 +
-- Load a module by indexing "loader" with the name of the module minus the
 +
-- "Module:Unicode data/" part. For instance, loader.blocks returns
 +
-- [[Module:Unicode data/blocks]]. If a module cannot be loaded, false will be
 +
-- returned.
 +
local loader = setmetatable({}, {
 +
__index = function (self, key)
 +
local success, data = pcall(mw.loadData, "Module:Unicode data/" .. key)
 +
if not success then
 +
data = false
 +
end
 +
self[key] = data
 +
return data
 +
end
 +
})
  
 
-- For the algorithm used to generate Hangul Syllable names,
 
-- For the algorithm used to generate Hangul Syllable names,
Line 10: Line 58:
 
-- Unicode Specification:
 
-- Unicode Specification:
 
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
 
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
local hangul_leads = {
 
[0] = "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS",
 
"", "J", "JJ", "C", "K", "T", "P", "H"
 
}
 
hangul_leads.length = #hangul_leads + 1
 
 
local hangul_vowels = {
 
[0] = "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
 
"WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
 
"I"
 
}
 
hangul_vowel_count = #hangul_vowels + 1
 
 
local hangul_trails = {
 
[0] = "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB",
 
"LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K",
 
"T", "P", "H"
 
}
 
hangul_trail_count = #hangul_trails + 1
 
 
hangul_codas = hangul_vowel_count * hangul_trail_count
 
 
 
local name_hooks = {
 
local name_hooks = {
 
{    0x00,    0x1F, "<control-%04X>" }, -- C0 control characters
 
{    0x00,    0x1F, "<control-%04X>" }, -- C0 control characters
 
{    0x7F,    0x9F, "<control-%04X>" }, -- DEL and C1 control characters
 
{    0x7F,    0x9F, "<control-%04X>" }, -- DEL and C1 control characters
 
{  0x3400,  0x4DB5, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
 
{  0x3400,  0x4DB5, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
{  0x4E00,  0x9FEF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph --change v10
+
{  0x4E00,  0x9FEF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
{  0xAC00,  0xD7A3, function (codepoint)
+
{  0xAC00,  0xD7A3, function (codepoint) -- Hangul Syllables
 +
local Hangul_data = loader.Hangul
 
local syllable_index = codepoint - 0xAC00
 
local syllable_index = codepoint - 0xAC00
  
 
return ("HANGUL SYLLABLE %s%s%s"):format(
 
return ("HANGUL SYLLABLE %s%s%s"):format(
hangul_leads[floor(syllable_index / hangul_codas)],
+
Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
hangul_vowels[floor((syllable_index % hangul_codas) / hangul_trail_count)],
+
Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
hangul_trails[syllable_index % hangul_trail_count]
+
/ Hangul_data.trail_count)],
 +
Hangul_data.trails[syllable_index % Hangul_data.trail_count]
 
)
 
)
 
end },
 
end },
{  0xD800,  0xDB7F, "<surrogate-%04X>" }, -- Non Private Use High Surrogate
+
-- High Surrogates, High Private Use Surrogates, Low Surrogates
{  0xDB80,   0xDBFF, "<surrogate-%04X>" }, -- Private Use High Surrogate
+
0xD800,  0xDFFF, "<surrogate-%04X>" },
0xDC00,  0xDFFF, "<surrogate-%04X>" }, -- Low Surrogate
 
 
{  0xE000,  0xF8FF, "<private-use-%04X>" }, -- Private Use
 
{  0xE000,  0xF8FF, "<private-use-%04X>" }, -- Private Use
{  0x17000,  0x187F1, "TANGUT IDEOGRAPH-%05X" }, -- Tangut
+
-- CJK Compatibility Ideographs
{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%05X" }, -- Nushu --add v10
+
{  0xF900,  0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0x20000,  0x2A6D6, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension B
+
{  0xFA70,  0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0x2A700,  0x2B734, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension C
+
{  0x17000,  0x187F1, "TANGUT IDEOGRAPH-%04X" }, -- Tangut
{  0x2A740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension D
+
{  0x18800,  0x18AF2, function (codepoint)
{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension E
+
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension F --add v10
+
end },
{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%05X" }, -- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
+
{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
{  0xF0000,  0xFFFFD, "<private-use-%05X>" }, -- Plane 15 Private Use
+
{  0x20000,  0x2A6D6, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
{ 0x100000, 0x10FFFD, "<private-use-%06X>" }  -- Plane 16 Private Use
+
{  0x2A700,  0x2B734, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
 +
{  0x2A740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
 +
{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
 +
{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
 +
-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
 +
{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
 +
{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
 +
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
 +
end},
 +
{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
 +
{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
 
}
 
}
 +
name_hooks.length = #name_hooks
  
 
local name_range_cache
 
local name_range_cache
Line 71: Line 109:
 
end
 
end
 
end
 
end
 +
 +
--[[
 +
-- Checks that the code point is a number and in range.
 +
-- Does not check whether code point is an integer.
 +
-- Not used
 +
local function check_codepoint(funcName, argIdx, val)
 +
require 'libraryUtil'.checkType(funcName, argIdx, val, 'number')
 +
if codepoint < 0 or 0x10FFFF < codepoint then
 +
errorf("Codepoint %04X out of range", codepoint)
 +
end
 +
end
 +
--]]
  
 
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
 
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function export.lookup_name(codepoint)
+
function p.lookup_name(codepoint)
-- U+FDD0-U+FDEF and all codepoints ending in FFFE or FFFF are noncharacters:
+
-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
 +
-- (Cn) and specifically noncharacters:
 
-- https://www.unicode.org/faq/private_use.html#nonchar4
 
-- https://www.unicode.org/faq/private_use.html#nonchar4
 
if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
 
if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
or math.floor(codepoint % 0x10000) >= 0xFFFE) then
+
or floor(codepoint % 0x10000) >= 0xFFFE) then
 
return ("<noncharacter-%04X>"):format(codepoint)
 
return ("<noncharacter-%04X>"):format(codepoint)
 
end
 
end
  
if name_range_cache then
+
if name_range_cache -- Check if previously used "name hook" applies to this code point.
if (codepoint >= name_range_cache[1]) and (codepoint <= name_range_cache[2]) then
+
and codepoint >= name_range_cache[1]
return generate_name(name_range_cache[3], codepoint)
+
and codepoint <= name_range_cache[2] then
end
+
return generate_name(name_range_cache[3], codepoint)
 
end
 
end
 
+
for _, item in ipairs(name_hooks) do
+
local range = binary_range_search(codepoint, name_hooks)
if codepoint < item[1] then
+
if range then
break
+
name_range_cache = range
elseif codepoint <= item[2] then
+
return generate_name(range[3], codepoint)
name_range_cache = item
 
return generate_name(item[3], codepoint)
 
end
 
 
end
 
end
  
local success, data = pcall(mw.loadData,
+
local data = loader[('names/%03X'):format(codepoint / 0x1000)]
('Module:Unicode data/names/%03X'):format(codepoint / 0x1000))
 
 
 
if success and data[codepoint] then
+
if data and data[codepoint] then
 
return data[codepoint]
 
return data[codepoint]
 
 
-- Unassigned (Cn) includes noncharacters and reserved characters.
+
-- Unassigned (Cn) consists of noncharacters and reserved characters.
-- The character is not a noncharacter and if it were assigned, its name
+
-- The character has been established not to be a noncharacter,
-- would already been retrieved, so it must be reserved.
+
-- and if it were assigned, its name would already been retrieved,
 +
-- so it must be reserved.
 
else
 
else
 
return ("<reserved-%04X>"):format(codepoint)
 
return ("<reserved-%04X>"):format(codepoint)
Line 110: Line 158:
 
end
 
end
  
function export.lookup_image(codepoint)
+
--[[
local success, data = pcall(mw.loadData,
+
-- No image data modules on Wikipedia yet.
('Module:Unicode data/images/%03X'):format(codepoint / 0x1000)
+
function p.lookup_image(codepoint)
)
+
local data = loader[('images/%03X'):format(codepoint / 0x1000)]
 
 
if success then
+
if data then
 
return data[codepoint]
 
return data[codepoint]
 
end
 
end
 
end
 
end
 
+
--]]
function export.template_lookup_name(frame)
 
local codepoint = tonumber(frame.args[1] or frame:getParent().args[1])
 
local name = export.lookup_name(codepoint)
 
return name:gsub("<", "&lt;")
 
end
 
  
 
local planes = {
 
local planes = {
Line 130: Line 173:
 
[ 1] = "Supplementary Multilingual Plane";
 
[ 1] = "Supplementary Multilingual Plane";
 
[ 2] = "Supplementary Ideographic Plane";
 
[ 2] = "Supplementary Ideographic Plane";
[13] = "Supplementary Special-purpose Plane";
+
[14] = "Supplementary Special-purpose Plane";
[14] = "Supplementary Private Use Area-A";
+
[15] = "Supplementary Private Use Area-A";
[15] = "Supplementary Private Use Area-B";
+
[16] = "Supplementary Private Use Area-B";
 
}
 
}
  
-- http://www.unicode.org/Public/UNIDATA/Blocks.txt
+
-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable.
-- This should be kept synchronized with [[Module:category tree/scriptcatboiler/blocks]].
+
local blocks
local blocks = {
+
 
{ "Basic Latin",                                    0x000000, 0x00007F },
+
local function block_iter(blocks, i)
{ "Latin-1 Supplement",                              0x000080, 0x0000FF },
+
i = i + 1
{ "Latin Extended-A",                                0x000100, 0x00017F },
+
local data = blocks[i]
{ "Latin Extended-B",                                0x000180, 0x00024F },
+
if data then
{ "IPA Extensions",                                  0x000250, 0x0002AF },
+
-- Unpack doesn't work on tables loaded with mw.loadData.
{ "Spacing Modifier Letters",                        0x0002B0, 0x0002FF },
+
return i, data[1], data[2], data[3]
{ "Combining Diacritical Marks",                    0x000300, 0x00036F },
+
end
{ "Greek and Coptic",                                0x000370, 0x0003FF },
+
end
{ "Cyrillic",                                        0x000400, 0x0004FF },
 
{ "Cyrillic Supplement",                            0x000500, 0x00052F },
 
{ "Armenian",                                        0x000530, 0x00058F },
 
{ "Hebrew",                                          0x000590, 0x0005FF },
 
{ "Arabic",                                          0x000600, 0x0006FF },
 
{ "Syriac",                                          0x000700, 0x00074F },
 
{ "Arabic Supplement",                              0x000750, 0x00077F },
 
{ "Thaana",                                          0x000780, 0x0007BF },
 
{ "NKo",                                            0x0007C0, 0x0007FF },
 
{ "Samaritan",                                      0x000800, 0x00083F },
 
{ "Mandaic",                                        0x000840, 0x00085F },
 
{ "Syriac Supplement",                              0x000860, 0x00086F },
 
{ "Arabic Extended-A",                              0x0008A0, 0x0008FF },
 
{ "Devanagari",                                      0x000900, 0x00097F },
 
{ "Bengali",                                        0x000980, 0x0009FF },
 
{ "Gurmukhi",                                        0x000A00, 0x000A7F },
 
{ "Gujarati",                                        0x000A80, 0x000AFF },
 
{ "Oriya",                                          0x000B00, 0x000B7F },
 
{ "Tamil",                                          0x000B80, 0x000BFF },
 
{ "Telugu",                                          0x000C00, 0x000C7F },
 
{ "Kannada",                                        0x000C80, 0x000CFF },
 
{ "Malayalam",                                      0x000D00, 0x000D7F },
 
{ "Sinhala",                                        0x000D80, 0x000DFF },
 
{ "Thai",                                            0x000E00, 0x000E7F },
 
{ "Lao",                                            0x000E80, 0x000EFF },
 
{ "Tibetan",                                        0x000F00, 0x000FFF },
 
{ "Myanmar",                                        0x001000, 0x00109F },
 
{ "Georgian",                                        0x0010A0, 0x0010FF },
 
{ "Hangul Jamo",                                    0x001100, 0x0011FF },
 
{ "Ethiopic",                                        0x001200, 0x00137F },
 
{ "Ethiopic Supplement",                            0x001380, 0x00139F },
 
{ "Cherokee",                                        0x0013A0, 0x0013FF },
 
{ "Unified Canadian Aboriginal Syllabics",          0x001400, 0x00167F },
 
{ "Ogham",                                          0x001680, 0x00169F },
 
{ "Runic",                                          0x0016A0, 0x0016FF },
 
{ "Tagalog",                                        0x001700, 0x00171F },
 
{ "Hanunoo",                                        0x001720, 0x00173F },
 
{ "Buhid",                                          0x001740, 0x00175F },
 
{ "Tagbanwa",                                        0x001760, 0x00177F },
 
{ "Khmer",                                          0x001780, 0x0017FF },
 
{ "Mongolian",                                      0x001800, 0x0018AF },
 
{ "Unified Canadian Aboriginal Syllabics Extended",  0x0018B0, 0x0018FF },
 
{ "Limbu",                                          0x001900, 0x00194F },
 
{ "Tai Le",                                          0x001950, 0x00197F },
 
{ "New Tai Lue",                                    0x001980, 0x0019DF },
 
{ "Khmer Symbols",                                  0x0019E0, 0x0019FF },
 
{ "Buginese",                                        0x001A00, 0x001A1F },
 
{ "Tai Tham",                                        0x001A20, 0x001AAF },
 
{ "Combining Diacritical Marks Extended",            0x001AB0, 0x001AFF },
 
{ "Balinese",                                        0x001B00, 0x001B7F },
 
{ "Sundanese",                                      0x001B80, 0x001BBF },
 
{ "Batak",                                          0x001BC0, 0x001BFF },
 
{ "Lepcha",                                          0x001C00, 0x001C4F },
 
{ "Ol Chiki",                                        0x001C50, 0x001C7F },
 
{ "Cyrillic Extended-C",                            0x001C80, 0x001C8F },
 
{ "Georgian Extended",                               0x001C90, 0x001CBF },
 
{ "Sundanese Supplement",                            0x001CC0, 0x001CCF },
 
{ "Vedic Extensions",                                0x001CD0, 0x001CFF },
 
{ "Phonetic Extensions",                            0x001D00, 0x001D7F },
 
{ "Phonetic Extensions Supplement",                  0x001D80, 0x001DBF },
 
{ "Combining Diacritical Marks Supplement",          0x001DC0, 0x001DFF },
 
{ "Latin Extended Additional",                      0x001E00, 0x001EFF },
 
{ "Greek Extended",                                  0x001F00, 0x001FFF },
 
{ "General Punctuation",                            0x002000, 0x00206F },
 
{ "Superscripts and Subscripts",                    0x002070, 0x00209F },
 
{ "Currency Symbols",                                0x0020A0, 0x0020CF },
 
{ "Combining Diacritical Marks for Symbols",        0x0020D0, 0x0020FF },
 
{ "Letterlike Symbols",                              0x002100, 0x00214F },
 
{ "Number Forms",                                    0x002150, 0x00218F },
 
{ "Arrows",                                          0x002190, 0x0021FF },
 
{ "Mathematical Operators",                          0x002200, 0x0022FF },
 
{ "Miscellaneous Technical",                        0x002300, 0x0023FF },
 
{ "Control Pictures",                                0x002400, 0x00243F },
 
{ "Optical Character Recognition",                  0x002440, 0x00245F },
 
{ "Enclosed Alphanumerics",                          0x002460, 0x0024FF },
 
{ "Box Drawing",                                    0x002500, 0x00257F },
 
{ "Block Elements",                                  0x002580, 0x00259F },
 
{ "Geometric Shapes",                                0x0025A0, 0x0025FF },
 
{ "Miscellaneous Symbols",                          0x002600, 0x0026FF },
 
{ "Dingbats",                                        0x002700, 0x0027BF },
 
{ "Miscellaneous Mathematical Symbols-A",            0x0027C0, 0x0027EF },
 
{ "Supplemental Arrows-A",                          0x0027F0, 0x0027FF },
 
{ "Braille Patterns",                                0x002800, 0x0028FF },
 
{ "Supplemental Arrows-B",                          0x002900, 0x00297F },
 
{ "Miscellaneous Mathematical Symbols-B",            0x002980, 0x0029FF },
 
{ "Supplemental Mathematical Operators",            0x002A00, 0x002AFF },
 
{ "Miscellaneous Symbols and Arrows",                0x002B00, 0x002BFF },
 
{ "Glagolitic",                                      0x002C00, 0x002C5F },
 
{ "Latin Extended-C",                                0x002C60, 0x002C7F },
 
{ "Coptic",                                          0x002C80, 0x002CFF },
 
{ "Georgian Supplement",                            0x002D00, 0x002D2F },
 
{ "Tifinagh",                                        0x002D30, 0x002D7F },
 
{ "Ethiopic Extended",                              0x002D80, 0x002DDF },
 
{ "Cyrillic Extended-A",                            0x002DE0, 0x002DFF },
 
{ "Supplemental Punctuation",                        0x002E00, 0x002E7F },
 
{ "CJK Radicals Supplement",                        0x002E80, 0x002EFF },
 
{ "Kangxi Radicals",                                0x002F00, 0x002FDF },
 
{ "Ideographic Description Characters",              0x002FF0, 0x002FFF },
 
{ "CJK Symbols and Punctuation",                    0x003000, 0x00303F },
 
{ "Hiragana",                                        0x003040, 0x00309F },
 
{ "Katakana",                                        0x0030A0, 0x0030FF },
 
{ "Bopomofo",                                        0x003100, 0x00312F },
 
{ "Hangul Compatibility Jamo",                      0x003130, 0x00318F },
 
{ "Kanbun",                                          0x003190, 0x00319F },
 
{ "Bopomofo Extended",                              0x0031A0, 0x0031BF },
 
{ "CJK Strokes",                                    0x0031C0, 0x0031EF },
 
{ "Katakana Phonetic Extensions",                    0x0031F0, 0x0031FF },
 
{ "Enclosed CJK Letters and Months",                0x003200, 0x0032FF },
 
{ "CJK Compatibility",                              0x003300, 0x0033FF },
 
{ "CJK Unified Ideographs Extension A",              0x003400, 0x004DBF },
 
{ "Yijing Hexagram Symbols",                        0x004DC0, 0x004DFF },
 
{ "CJK Unified Ideographs",                          0x004E00, 0x009FFF },
 
{ "Yi Syllables",                                    0x00A000, 0x00A48F },
 
{ "Yi Radicals",                                    0x00A490, 0x00A4CF },
 
{ "Lisu",                                            0x00A4D0, 0x00A4FF },
 
{ "Vai",                                            0x00A500, 0x00A63F },
 
{ "Cyrillic Extended-B",                            0x00A640, 0x00A69F },
 
{ "Bamum",                                          0x00A6A0, 0x00A6FF },
 
{ "Modifier Tone Letters",                          0x00A700, 0x00A71F },
 
{ "Latin Extended-D",                                0x00A720, 0x00A7FF },
 
{ "Syloti Nagri",                                    0x00A800, 0x00A82F },
 
{ "Common Indic Number Forms",                      0x00A830, 0x00A83F },
 
{ "Phags-pa",                                        0x00A840, 0x00A87F },
 
{ "Saurashtra",                                      0x00A880, 0x00A8DF },
 
{ "Devanagari Extended",                            0x00A8E0, 0x00A8FF },
 
{ "Kayah Li",                                        0x00A900, 0x00A92F },
 
{ "Rejang",                                          0x00A930, 0x00A95F },
 
{ "Hangul Jamo Extended-A",                          0x00A960, 0x00A97F },
 
{ "Javanese",                                        0x00A980, 0x00A9DF },
 
{ "Myanmar Extended-B",                              0x00A9E0, 0x00A9FF },
 
{ "Cham",                                            0x00AA00, 0x00AA5F },
 
{ "Myanmar Extended-A",                              0x00AA60, 0x00AA7F },
 
{ "Tai Viet",                                        0x00AA80, 0x00AADF },
 
{ "Meetei Mayek Extensions",                        0x00AAE0, 0x00AAFF },
 
{ "Ethiopic Extended-A",                            0x00AB00, 0x00AB2F },
 
{ "Latin Extended-E",                                0x00AB30, 0x00AB6F },
 
{ "Cherokee Supplement",                            0x00AB70, 0x00ABBF },
 
{ "Meetei Mayek",                                    0x00ABC0, 0x00ABFF },
 
{ "Hangul Syllables",                                0x00AC00, 0x00D7AF },
 
{ "Hangul Jamo Extended-B",                          0x00D7B0, 0x00D7FF },
 
{ "High Surrogates",                                0x00D800, 0x00DB7F },
 
{ "High Private Use Surrogates",                    0x00DB80, 0x00DBFF },
 
{ "Low Surrogates",                                  0x00DC00, 0x00DFFF },
 
{ "Private Use Area",                                0x00E000, 0x00F8FF },
 
{ "CJK Compatibility Ideographs",                    0x00F900, 0x00FAFF },
 
{ "Alphabetic Presentation Forms",                  0x00FB00, 0x00FB4F },
 
{ "Arabic Presentation Forms-A",                    0x00FB50, 0x00FDFF },
 
{ "Variation Selectors",                            0x00FE00, 0x00FE0F },
 
{ "Vertical Forms",                                  0x00FE10, 0x00FE1F },
 
{ "Combining Half Marks",                            0x00FE20, 0x00FE2F },
 
{ "CJK Compatibility Forms",                        0x00FE30, 0x00FE4F },
 
{ "Small Form Variants",                            0x00FE50, 0x00FE6F },
 
{ "Arabic Presentation Forms-B",                    0x00FE70, 0x00FEFF },
 
{ "Halfwidth and Fullwidth Forms",                  0x00FF00, 0x00FFEF },
 
{ "Specials",                                        0x00FFF0, 0x00FFFF },
 
{ "Linear B Syllabary",                              0x010000, 0x01007F },
 
{ "Linear B Ideograms",                              0x010080, 0x0100FF },
 
{ "Aegean Numbers",                                  0x010100, 0x01013F },
 
{ "Ancient Greek Numbers",                          0x010140, 0x01018F },
 
{ "Ancient Symbols",                                0x010190, 0x0101CF },
 
{ "Phaistos Disc",                                  0x0101D0, 0x0101FF },
 
{ "Lycian",                                          0x010280, 0x01029F },
 
{ "Carian",                                          0x0102A0, 0x0102DF },
 
{ "Coptic Epact Numbers",                            0x0102E0, 0x0102FF },
 
{ "Old Italic",                                      0x010300, 0x01032F },
 
{ "Gothic",                                          0x010330, 0x01034F },
 
{ "Old Permic",                                      0x010350, 0x01037F },
 
{ "Ugaritic",                                        0x010380, 0x01039F },
 
{ "Old Persian",                                    0x0103A0, 0x0103DF },
 
{ "Deseret",                                        0x010400, 0x01044F },
 
{ "Shavian",                                        0x010450, 0x01047F },
 
{ "Osmanya",                                        0x010480, 0x0104AF },
 
{ "Osage",                                          0x0104B0, 0x0104FF },
 
{ "Elbasan",                                        0x010500, 0x01052F },
 
{ "Caucasian Albanian",                              0x010530, 0x01056F },
 
{ "Linear A",                                        0x010600, 0x01077F },
 
{ "Cypriot Syllabary",                              0x010800, 0x01083F },
 
{ "Imperial Aramaic",                                0x010840, 0x01085F },
 
{ "Palmyrene",                                      0x010860, 0x01087F },
 
{ "Nabataean",                                      0x010880, 0x0108AF },
 
{ "Hatran",                                          0x0108E0, 0x0108FF },
 
{ "Phoenician",                                      0x010900, 0x01091F },
 
{ "Lydian",                                          0x010920, 0x01093F },
 
{ "Meroitic Hieroglyphs",                            0x010980, 0x01099F },
 
{ "Meroitic Cursive",                                0x0109A0, 0x0109FF },
 
{ "Kharoshthi",                                      0x010A00, 0x010A5F },
 
{ "Old South Arabian",                              0x010A60, 0x010A7F },
 
{ "Old North Arabian",                              0x010A80, 0x010A9F },
 
{ "Manichaean",                                      0x010AC0, 0x010AFF },
 
{ "Avestan",                                        0x010B00, 0x010B3F },
 
{ "Inscriptional Parthian",                          0x010B40, 0x010B5F },
 
{ "Inscriptional Pahlavi",                          0x010B60, 0x010B7F },
 
{ "Psalter Pahlavi",                                0x010B80, 0x010BAF },
 
{ "Old Turkic",                                      0x010C00, 0x010C4F },
 
{ "Old Hungarian",                                  0x010C80, 0x010CFF },
 
{ "Hanifi Rohingya",                                0x010D00, 0x010D3F },
 
{ "Rumi Numeral Symbols",                            0x010E60, 0x010E7F },
 
{ "Old Sogdian",                                    0x010F00, 0x010F2F },
 
{ "Sogdian",                                        0x010F30, 0x010F6F },
 
{ "Brahmi",                                          0x011000, 0x01107F },
 
{ "Kaithi",                                          0x011080, 0x0110CF },
 
{ "Sora Sompeng",                                    0x0110D0, 0x0110FF },
 
{ "Chakma",                                          0x011100, 0x01114F },
 
{ "Mahajani",                                        0x011150, 0x01117F },
 
{ "Sharada",                                        0x011180, 0x0111DF },
 
{ "Sinhala Archaic Numbers",                        0x0111E0, 0x0111FF },
 
{ "Khojki",                                          0x011200, 0x01124F },
 
{ "Multani",                                        0x011280, 0x0112AF },
 
{ "Khudawadi",                                      0x0112B0, 0x0112FF },
 
{ "Grantha",                                        0x011300, 0x01137F },
 
{ "Newa",                                            0x011400, 0x01147F },
 
{ "Tirhuta",                                        0x011480, 0x0114DF },
 
{ "Siddham",                                        0x011580, 0x0115FF },
 
{ "Modi",                                            0x011600, 0x01165F },
 
{ "Mongolian Supplement",                            0x011660, 0x01167F },
 
{ "Takri",                                          0x011680, 0x0116CF },
 
{ "Ahom",                                            0x011700, 0x01173F },
 
{ "Dogra",                                          0x011800, 0x01184F },
 
{ "Warang Citi",                                    0x0118A0, 0x0118FF },
 
{ "Zanabazar Square",                                0x011A00, 0x011A4F },
 
{ "Soyombo",                                        0x011A50, 0x011AAF },
 
{ "Pau Cin Hau",                                    0x011AC0, 0x011AFF },
 
{ "Bhaiksuki",                                      0x011C00, 0x011C6F },
 
{ "Marchen",                                        0x011C70, 0x011CBF },
 
{ "Masaram Gondi",                                  0x011D00, 0x011D5F },
 
{ "Gunjala Gondi",                                  0x011D60, 0x011DAF },
 
{ "Makasar",                                        0x011EE0, 0x011EFF },
 
{ "Cuneiform",                                      0x012000, 0x0123FF },
 
{ "Cuneiform Numbers and Punctuation",              0x012400, 0x01247F },
 
{ "Early Dynastic Cuneiform",                        0x012480, 0x01254F },
 
{ "Egyptian Hieroglyphs",                            0x013000, 0x01342F },
 
{ "Anatolian Hieroglyphs",                          0x014400, 0x01467F },
 
{ "Bamum Supplement",                                0x016800, 0x016A3F },
 
{ "Mro",                                            0x016A40, 0x016A6F },
 
{ "Bassa Vah",                                      0x016AD0, 0x016AFF },
 
{ "Pahawh Hmong",                                    0x016B00, 0x016B8F },
 
{ "Medefaidrin",                                    0x016E40, 0x016E9F },
 
{ "Miao",                                            0x016F00, 0x016F9F },
 
{ "Ideographic Symbols and Punctuation",            0x016FE0, 0x016FFF },
 
{ "Tangut",                                          0x017000, 0x0187FF },
 
{ "Tangut Components",                              0x018800, 0x018AFF },
 
{ "Kana Supplement",                                0x01B000, 0x01B0FF },
 
{ "Kana Extended-A",                                0x01B100, 0x01B12F },
 
{ "Nushu",                                          0x01B170, 0x01B2FF },
 
{ "Duployan",                                        0x01BC00, 0x01BC9F },
 
{ "Shorthand Format Controls",                      0x01BCA0, 0x01BCAF },
 
{ "Byzantine Musical Symbols",                      0x01D000, 0x01D0FF },
 
{ "Musical Symbols",                                0x01D100, 0x01D1FF },
 
{ "Ancient Greek Musical Notation",                  0x01D200, 0x01D24F },
 
{ "Mayan Numerals",                                  0x01D2E0, 0x01D2FF },
 
{ "Tai Xuan Jing Symbols",                          0x01D300, 0x01D35F },
 
{ "Counting Rod Numerals",                          0x01D360, 0x01D37F },
 
{ "Mathematical Alphanumeric Symbols",              0x01D400, 0x01D7FF },
 
{ "Sutton SignWriting",                              0x01D800, 0x01DAAF },
 
{ "Glagolitic Supplement",                          0x01E000, 0x01E02F },
 
{ "Mende Kikakui",                                  0x01E800, 0x01E8DF },
 
{ "Adlam",                                          0x01E900, 0x01E95F },
 
{ "Indic Siyaq Numbers",                            0x01EC70, 0x01ECBF },
 
{ "Arabic Mathematical Alphabetic Symbols",          0x01EE00, 0x01EEFF },
 
{ "Mahjong Tiles",                                  0x01F000, 0x01F02F },
 
{ "Domino Tiles",                                    0x01F030, 0x01F09F },
 
{ "Playing Cards",                                  0x01F0A0, 0x01F0FF },
 
{ "Enclosed Alphanumeric Supplement",                0x01F100, 0x01F1FF },
 
{ "Enclosed Ideographic Supplement",                0x01F200, 0x01F2FF },
 
{ "Miscellaneous Symbols and Pictographs",          0x01F300, 0x01F5FF },
 
{ "Emoticons",                                      0x01F600, 0x01F64F },
 
{ "Ornamental Dingbats",                            0x01F650, 0x01F67F },
 
{ "Transport and Map Symbols",                      0x01F680, 0x01F6FF },
 
{ "Alchemical Symbols",                              0x01F700, 0x01F77F },
 
{ "Geometric Shapes Extended",                      0x01F780, 0x01F7FF },
 
{ "Supplemental Arrows-C",                          0x01F800, 0x01F8FF },
 
{ "Supplemental Symbols and Pictographs",            0x01F900, 0x01F9FF },
 
{ "Chess Symbols",                                  0x01FA00, 0x01FA6F },
 
{ "CJK Unified Ideographs Extension B",              0x020000, 0x02A6DF },
 
{ "CJK Unified Ideographs Extension C",              0x02A700, 0x02B73F },
 
{ "CJK Unified Ideographs Extension D",              0x02B740, 0x02B81F },
 
{ "CJK Unified Ideographs Extension E",              0x02B820, 0x02CEAF },
 
{ "CJK Unified Ideographs Extension F",              0x02CEB0, 0x02EBEF },
 
{ "CJK Compatibility Ideographs Supplement",        0x02F800, 0x02FA1F },
 
{ "Tags",                                            0x0E0000, 0x0E007F },
 
{ "Variation Selectors Supplement",                  0x0E0100, 0x0E01EF },
 
{ "Supplementary Private Use Area-A",                0x0F0000, 0x0FFFFF },
 
{ "Supplementary Private Use Area-B",                0x100000, 0x10FFFF },
 
}
 
blocks.length = #blocks
 
  
function export.enum_blocks()
+
-- An ipairs-type iterator generator for the list of blocks.
return function (blocks, i)
+
function p.enum_blocks()
i = i + 1
+
local blocks = loader.blocks
local data = blocks[i]
+
return block_iter, blocks, 0
if not data then
 
return nil
 
end
 
return i, unpack(data)
 
end, blocks, 0
 
 
end
 
end
  
function export.lookup_plane(codepoint)
+
function p.lookup_plane(codepoint)
 
local i = floor(codepoint / 0x10000)
 
local i = floor(codepoint / 0x10000)
 
return planes[i] or ("Plane %u"):format(i)
 
return planes[i] or ("Plane %u"):format(i)
 
end
 
end
  
-- Binary search, to avoid iterating over entire table in order to look up the
+
function p.lookup_block(codepoint)
-- higher codepoints.
+
local blocks = loader.blocks
function export.lookup_block(codepoint)
+
local range = binary_range_search(codepoint, blocks)
local iStart, iEnd = 1, blocks.length or #blocks
+
if range then
while iStart <= iEnd do
+
return range[3]
local iMid = floor((iStart + iEnd) / 2)
+
else
local range = blocks[iMid]
+
return "No Block"
if codepoint < range[2] then
 
iEnd = iMid - 1
 
elseif codepoint <= range[3] then
 
return range[1]
 
else
 
iStart = iMid + 1
 
end
 
 
end
 
end
error(string.format("No block found for codepoint U+%04X.", codepoint))
 
 
end
 
end
  
function export.get_block_range(name)
+
function p.get_block_info(name)
local range
+
for i, block in ipairs(loader.blocks) do
+
if block[3] == name then
for i, block in ipairs(blocks) do
+
return block
if block[1] == name then
 
range = block
 
 
end
 
end
end
 
 
if range then
 
return range[2], range[3]
 
 
end
 
end
 
end
 
end
  
function export.is_valid_pagename(pagename)
+
function p.is_valid_pagename(pagename)
 
local has_nonws = false
 
local has_nonws = false
  
Line 497: Line 236:
 
end
 
end
  
local printable, result = export.is_printable(cp)
+
local printable, result = p.is_printable(cp)
 
if not printable then
 
if not printable then
 
return false
 
return false
Line 511: Line 250:
  
 
local function manual_unpack(what, from)
 
local function manual_unpack(what, from)
 +
if what[from + 1] == nil then
 +
return what[from]
 +
end
 +
 
local result = {}
 
local result = {}
 
from = from or 1
 
from = from or 1
Line 521: Line 264:
 
end
 
end
  
local function memo_lookup(loader, match_func, ...)
+
local function compare_ranges(range1, range2)
 +
return range1[1] < range2[1]
 +
end
 +
 
 +
-- Creates a function to look up data in a module that contains "singles" (a
 +
-- code point-to-data map) and "ranges" (an array containing arrays that contain
 +
-- the low and high code points of a range and the data associated with that
 +
-- range).
 +
-- "loader" loads and returns the "singles" and "ranges" tables.
 +
-- "match_func" is passed the code point and either the data or the "dots", and
 +
-- generates the final result of the function.
 +
-- The varargs ("dots") describes the default data to be returned if there wasn't
 +
-- a match.
 +
-- In case the function is used more than once, "cache" saves ranges that have
 +
-- already been found to match, or a range whose data is the default if there
 +
-- was no match.
 +
local function memo_lookup(data_module_subpage, match_func, ...)
 
local dots = { ... }
 
local dots = { ... }
 
local cache = {}
 
local cache = {}
Line 528: Line 287:
 
return function (codepoint)
 
return function (codepoint)
 
if not singles then
 
if not singles then
singles, ranges = loader()
+
local data_module = loader[data_module_subpage]
 +
singles, ranges = data_module.singles, data_module.ranges
 
end
 
end
  
Line 535: Line 295:
 
end
 
end
  
local lastlast = -1
+
local range = binary_range_search(codepoint, cache)
for _, range in pairs(cache) do
+
if range then
if (range[1] <= codepoint) and (codepoint <= range[2]) then
+
return match_func(codepoint, manual_unpack(range, 3))
return match_func(codepoint, unpack(range, 3))
+
end
end
+
 +
local range, index = binary_range_search(codepoint, ranges)
 +
if range then
 +
table.insert(cache, range)
 +
table.sort(cache, compare_ranges)
 +
return match_func(codepoint, manual_unpack(range, 3))
 
end
 
end
 
+
for _, range in pairs(ranges) do
+
if ranges[index] then
if codepoint < range[1] then
+
local dots_range
table.insert(cache, { lastlast + 1, range[1] - 1, unpack(dots) })
+
if codepoint > ranges[index][2] then
return match_func(codepoint, unpack(dots))
+
dots_range = {
elseif codepoint <= range[2] then
+
ranges[index][2] + 1,
table.insert(cache, { manual_unpack(range) })
+
ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF,
return match_func(codepoint, manual_unpack(range, 3))
+
unpack(dots)
else
+
}
lastlast = range[2]
+
else -- codepoint < range[index][1]
 +
dots_range = {
 +
ranges[index - 1] and ranges[index - 1][2] + 1 or 0,
 +
ranges[index][1] - 1,
 +
unpack(dots)
 +
}
 
end
 
end
 +
table.sort(cache, compare_ranges)
 
end
 
end
 
+
 
return match_func(codepoint)
 
return match_func(codepoint)
 
end
 
end
 
end
 
end
  
-- Get a codepoint's combining class value in [[Module:Unicode data/combining]],
+
-- Get a code point's combining class value in [[Module:Unicode data/combining]],
 
-- and return whether this value is not zero. Zero is assigned as the default
 
-- and return whether this value is not zero. Zero is assigned as the default
 
-- if the combining class value is not found in this data module.
 
-- if the combining class value is not found in this data module.
 
-- That is, return true if character is combining, or false if it is not.
 
-- That is, return true if character is combining, or false if it is not.
-- See http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
+
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
 
-- more information.
 
-- more information.
export.is_combining = memo_lookup(function ()
+
p.is_combining = memo_lookup(
local m_comb = mw.loadData('Module:Unicode data/combining')
+
"combining",
return m_comb.single, m_comb.ranges
+
function (codepoint, combining_class)
end, function (codepoint, combining_class)
+
return combining_class and combining_class ~= 0 or false
return combining_class and combining_class ~= 0
+
end,
or false
+
0)
end, 0)
 
  
function export.add_dotted_circle(str)
+
function p.add_dotted_circle(str)
 
return (mw.ustring.gsub(str, ".",
 
return (mw.ustring.gsub(str, ".",
 
function(char)
 
function(char)
if export.is_combining(mw.ustring.codepoint(char)) then
+
if p.is_combining(mw.ustring.codepoint(char)) then
 
return '◌' .. char
 
return '◌' .. char
 
end
 
end
Line 581: Line 351:
 
end
 
end
  
local lookup_control = memo_lookup(function ()
+
local lookup_control = memo_lookup(
local m_cc = mw.loadData('Module:Unicode data/control')
+
"control",
return m_cc.single, m_cc.ranges
+
function (codepoint, ccc)
end, function (codepoint, ccc)
+
return ccc or "assigned"
return ccc or "assigned"
+
end,
end, "assigned")
+
"assigned")
 +
p.lookup_control = lookup_control
  
function export.is_assigned(codepoint)
+
function p.is_assigned(codepoint)
 
return lookup_control(codepoint) ~= "unassigned"
 
return lookup_control(codepoint) ~= "unassigned"
 
end
 
end
  
function export.is_printable(codepoint)
+
function p.is_printable(codepoint)
 
local result = lookup_control(codepoint)
 
local result = lookup_control(codepoint)
 
return (result == "assigned") or (result == "space-separator"), result
 
return (result == "assigned") or (result == "space-separator"), result
 
end
 
end
  
function export.is_whitespace(codepoint)
+
function p.is_whitespace(codepoint)
 
local result = lookup_control(codepoint)
 
local result = lookup_control(codepoint)
 
return (result == "space-separator"), result
 
return (result == "space-separator"), result
 
end
 
end
  
-- to be used in language-neutral context only (e.g. character lists)
+
p.lookup_category = memo_lookup(
 +
"category",
 +
function (codepoint, category)
 +
return category
 +
end,
 +
"Cn")
  
local script_pats
+
local lookup_script = memo_lookup(
 +
"scripts",
 +
function (codepoint, script_code)
 +
return script_code or 'Zzzz'
 +
end,
 +
"Zzzz")
 +
p.lookup_script = lookup_script
  
-- Scripts that consist entirely of characters from another script.
+
function p.get_best_script(str)
local script_blacklist = {
+
-- Check type of argument, because mw.text.decode coerces numbers to strings!
["Latf"] = true;
+
require "libraryUtil".checkType("get_best_script", 1, str, "string")
["Hans"] = true;
+
["Hant"] = true;
+
-- Convert HTML character references (including named character references,
["Kore"] = true;
+
-- or character entities) to characters.
["Jpan"] = true;
+
str = mw.text.decode(str, true)
["fa-Arab"] = true;
+
["kk-Arab"] = true;
+
local scripts = {}
["ks-Arab"] = true;
+
for codepoint in mw.ustring.gcodepoint(str) do
["ku-Arab"] = true;
+
local script = lookup_script(codepoint)
["mzn-Arab"] = true;
+
["ota-Arab"] = true;
+
-- Ignore "Inherited", "Undetermined", or "Uncoded" scripts.
["pa-Arab"] = true;
+
if not (script == "Zyyy" or script == "Zinh" or script == "Zzzz") then
["ps-Arab"] = true;
+
scripts[script] = true
["sd-Arab"] = true;
 
["tt-Arab"] = true;
 
["ug-Arab"] = true;
 
["ur-Arab"] = true;
 
["nv-Latn"] = true;
 
["pjt-Latn"] = true;
 
["Zyyy"] = true;
 
}
 
 
 
--[[
 
Problem scripts: Grek and polytonic, Cyrl and Cyrs, Latn and Latinx.
 
In each key-value pair, the value should take precedence over the key.
 
]]
 
 
 
local overridden_by = {
 
["Cyrs"] = "Cyrl",
 
["polytonic"] = "Grek",
 
["Latinx"] = "Latn",
 
}
 
 
 
local script_cache = {}
 
 
 
function export.get_script(codepoint)
 
local text
 
if type(codepoint) == "number" then
 
text = mw.ustring.char(codepoint)
 
elseif type(codepoint) == "string" then
 
text = codepoint
 
else
 
error("Argument to get_script should be a number (codepoint) or string.")
 
end
 
 
 
for pat, sc in pairs(script_cache) do
 
if mw.ustring.match(text, pat) and not overridden_by[sc] then
 
return sc
 
 
end
 
end
 
end
 
end
 +
 +
-- If scripts does not contain two or more keys,
 +
-- return first and only key (script code) in table.
 +
if not next(scripts, next(scripts)) then
 +
return next(scripts)
 +
end -- else return majority script, or else "Zzzz"?
 +
end
  
if not script_pats then
+
function p.is_Latin(str)
local m_scripts = mw.loadData("Module:scripts/data")
+
require "libraryUtil".checkType("get_best_script", 1, str, "string")
script_pats = {}
+
str = mw.text.decode(str, true)
for sc, info in pairs(m_scripts) do
+
if info.characters and not script_blacklist[sc] then
+
-- Search for the leading bytes that introduce the UTF-8 encoding of the
script_pats[sc] = "[" .. info.characters .. "]"
+
-- code points U+0340-U+10FFFF. If they are not found and there is at least
 +
-- one Latin-script character, the string counts as Latin, because the rest
 +
-- of the characters can only be Zyyy, Zinh, and Zzzz.
 +
-- The only scripts found below U+0370 (the first code point of the Greek
 +
-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
 +
-- See the codepage in the [[UTF-8]] article.
 +
if not str:find "[\205-\244]" then
 +
for codepoint in mw.ustring.gcodepoint(str) do
 +
if lookup_script(codepoint) == "Latn" then
 +
return true
 
end
 
end
 
end
 
end
 
end
 
end
 
+
for sc, pat in pairs(script_pats) do
+
local Latn = false
if mw.ustring.match(text, pat) then
+
local overriding = overridden_by[sc]
+
for codepoint in mw.ustring.gcodepoint(str) do
if overriding and script_pats[overriding] and mw.ustring.match(text, script_pats[overriding]) then
+
local script = lookup_script(codepoint)
script_cache[script_pats[overriding]] = overriding
+
return overriding
+
if script == "Latn" then
else
+
Latn = true
script_cache[pat] = sc
+
elseif not (script == "Zyyy" or script == "Zinh"
return sc
+
or script == "Zzzz") then
end
+
return false
 
end
 
end
 
end
 
end
 
+
return "None"
+
return Latn
 
end
 
end
  
local function sortRange(range1, range2)
+
-- Checks that a string contains only characters belonging to right-to-left
return range1[1] < range2[1]
+
-- scripts, or characters of ignorable scripts.
end
+
function p.is_rtl(str)
 
+
require "libraryUtil".checkType("get_best_script", 1, str, "string")
--[[
+
str = mw.text.decode(str, true)
Binary search: more efficient for the longer lists of codepoint ranges than
+
for the shorter ones.
+
-- Search for the leading bytes that introduce the UTF-8 encoding of the
]]
+
-- code points U+0580-U+10FFFF. If they are not found, the string can only
local function binary_search(ranges, value)
+
-- have characters from a left-to-right script, because the first code point
if not ranges then
+
-- in a right-to-left script is U+0591, in the Hebrew block.
return nil
+
if not str:find "[\214-\244]" then
 +
return false
 
end
 
end
-- Initialize numbers.
+
local iStart, iMid = 1, 0
+
local result = false
-- Can't use # because table is loaded by mw.loadData.
+
local rtl = loader.scripts.rtl
local iEnd = ranges.length or require("Module:table").size(ranges)
+
for codepoint in mw.ustring.gcodepoint(str) do
 
+
local script = lookup_script(codepoint)
if iEnd == 0 then
+
return nil
+
if rtl[script] then
end
+
result = true
 
+
elseif not (script == "Zyyy" or script == "Zinh"
local iterations = 0
+
or script == "Zzzz") then
 
+
return false
-- Do search.
 
while iStart <= iEnd do
 
iterations = iterations + 1
 
 
 
-- Calculate middle.
 
iMid = floor((iStart + iEnd) / 2)
 
 
 
-- Get compare value.
 
local range = ranges[iMid]
 
 
 
if range[1] > value then
 
iEnd = iMid - 1
 
 
 
-- Return matching index. Assumes there are no duplicates.
 
elseif value <= range[2] then
 
return range
 
 
 
-- Keep searching.
 
else
 
iStart = iMid + 1
 
 
end
 
end
 
end
 
end
return nil
+
 +
return result
 
end
 
end
  
local function look_up_in_order(number, ranges)
+
local function get_codepoint(args, arg)
for i, range in ipairs(ranges) do
+
local codepoint_string = args[arg]
if number < range[1] then
+
or errorf(2, "Parameter %s is required", tostring(arg))
return nil
+
local codepoint = tonumber(codepoint_string, 16)
elseif number <= range[2] then
+
or errorf(2, "Parameter %s is not a code point in hexadecimal base",
return range[3]
+
tostring(arg))
end
+
if not (0 <= codepoint and codepoint <= 0x10FFFF) then
 +
errorf(2, "code point in parameter %s out of range", tostring(arg))
 
end
 
end
 +
return codepoint
 
end
 
end
  
-- Save previously used codepoint ranges in case another character is in the
+
local function get_func(args, arg, prefix)
-- same range.
+
local suffix = args[arg]
local ranges_cache = {}
+
or errorf(2, "Parameter %s is required", tostring(arg))
 
+
suffix = mw.text.trim(suffix)
--[=[
+
local func_name = prefix .. suffix
Takes a codepoint or a character and finds the script code (if any) that is
+
local func = p[func_name]
appropriate for it based on the codepoint, using the data module
+
or errorf(2, "There is no function '%s'", func_name)
[[Module:Unicode data/scripts]]. The data module was generated from the
+
return func
patterns in [[Module:scripts/data]] using [[Module:User:Erutuon/script recognition]].
+
end
 
 
Converts the character to a codepoint. Returns a script code if the codepoint
 
is in the list of individual characters, or if it is in one of the defined
 
ranges in the 4096-character block that it belongs to, else returns "None".
 
]=]
 
function export.char_to_script(char)
 
local lookup = mw.loadData("Module:Unicode data/scripts")
 
local t = type(char)
 
local codepoint
 
if t == "string" then
 
local etc
 
codepoint, etc = mw.ustring.codepoint(char)
 
if etc then
 
error("Argument to char_to_script should be a single character.")
 
end
 
elseif t == "number" then
 
codepoint = char
 
else
 
error("Argument to char_to_script should be a string or a number, but its type is " .. t .. ".")
 
end
 
 
 
local individual_match = lookup.individual[codepoint]
 
if individual_match then
 
return individual_match
 
else
 
local script = look_up_in_order(codepoint, ranges_cache)
 
if script then
 
return script
 
end
 
 
 
local index = floor(codepoint / 0x1000)
 
 
 
script = look_up_in_order(index, lookup.blocks)
 
if script then
 
return script
 
end
 
  
local range = binary_search(lookup[index], codepoint)
+
-- This function allows any of the "lookup" functions to be invoked. The first
if range then
+
-- parameter is the word after "lookup_"; the second parameter is the code point
table.insert(ranges_cache, range)
+
-- in hexadecimal base.
table.sort(ranges_cache, sortRange)
+
function p.lookup(frame)
return range[3]
+
local func = get_func(frame.args, 1, "lookup_")
end
+
local codepoint = get_codepoint(frame.args, 2)
 +
local result = func(codepoint)
 +
if func == p.lookup_name then
 +
-- Prevent code point labels such as <control-0000> from being
 +
-- interpreted as HTML tags.
 +
result = result:gsub("<", "&lt;")
 
end
 
end
 
+
return result
return "None"
 
 
end
 
end
  
function export.find_best_script(text)
+
function p.is(frame)
local scripts = {}
+
local func = get_func(frame.args, 1, "is_")
for character in text:gmatch("[%z\1-\127\194-\244][\128-\191]*") do
 
local script = export.char_to_script(character)
 
scripts[script] = (scripts[script] or 0) + 1
 
end
 
 
local best_script
 
local greatest_count = 0
 
for script, count in pairs(scripts) do
 
if count > greatest_count then
 
best_script = script
 
greatest_count = count
 
end
 
end
 
 
 
return best_script
+
-- is_Latin and is_valid_pagename take strings.
end
+
if func == p.is_Latin or func == p.is_valid_pagename or func == p.is_rtl then
 
+
return (func(frame.args[2]))
local unsupported_title = {
+
else -- The rest take code points.
[0x0020] = "Unsupported titles/Space";
+
local codepoint = get_codepoint(frame.args, 2)
[0x0023] = "Unsupported titles/Number sign";
+
return (func(codepoint)) -- Adjust to one result.
[0x002E] = "Unsupported titles/Full stop";
 
[0x003A] = "Unsupported titles/Colon";
 
[0x003C] = "Unsupported titles/Less than";
 
[0x003E] = "Unsupported titles/Greater than";
 
[0x005B] = "Unsupported titles/Left square bracket";
 
[0x005D] = "Unsupported titles/Right square bracket";
 
[0x005F] = "Unsupported titles/Low line";
 
[0x007B] = "Unsupported titles/Left curly bracket";
 
[0x007C] = "Unsupported titles/Vertical line";
 
[0x007D] = "Unsupported titles/Right curly bracket";
 
[0x1680] = "Unsupported titles/Ogham space";
 
[0xFFFD] = "Unsupported titles/Replacement character";
 
}
 
 
 
function export.get_entry_title(codepoint)
 
if unsupported_title[codepoint] then
 
return unsupported_title[codepoint]
 
end
 
if lookup_control(codepoint) ~= "assigned" then
 
return nil
 
 
end
 
end
return mw.ustring.char(codepoint)
 
 
end
 
end
  
return export
+
return p

Latest revision as of 14:34, 9 March 2020

Documentation for this module may be created at Module:Unicode data/doc

local p = {}

local floor = math.floor

-- Raises an error whose message is built with string.format.
-- Called as errorf(level, fmt, ...): the message is fmt formatted with the
-- remaining arguments and error() receives level + 1.
-- Called as errorf(fmt, ...): the first argument is not a number, so it is
-- taken as the format string and error() receives level 2.
local function errorf(level, ...)
	local message, error_level
	if type(level) == "number" then
		message, error_level = string.format(...), level + 1
	else
		-- No explicit level was given; "level" holds the format string.
		message, error_level = string.format(level, ...), 2
	end
	-- Kept as a tail call, exactly like the original branches.
	return error(message, error_level)
end

-- Binary search for the range that contains "codepoint".
-- "ranges" is an array of arrays whose first and second items are the low and
-- high ends of a range; it is searched as a sorted list. The number of ranges
-- is taken from ranges.length when present, otherwise it is computed with
-- Module:TableTools.
-- Returns the matching range and its index, or nil and the index that was
-- probed last (nil as well when there are no ranges at all).
local function binary_range_search(codepoint, ranges)
	local first = 1
	local last = ranges.length or require "Module:TableTools".length(ranges)
	local middle
	while first <= last do
		middle = floor((first + last) / 2)
		local candidate = ranges[middle]
		if codepoint < candidate[1] then
			-- Target lies below this range: continue in the lower half.
			last = middle - 1
		elseif codepoint > candidate[2] then
			-- Target lies above this range: continue in the upper half.
			first = middle + 1
		else
			return candidate, middle
		end
	end
	return nil, middle
end
p.binary_range_search = binary_range_search

--[[
local function linear_range_search(codepoint, ranges)
	for i, range in ipairs(ranges) do
		if range[1] <= codepoint and codepoint <= range[2] then
			return range
		end
	end
end
--]]

-- Lazy loader for the data subpages of this module. Indexing "loader" with a
-- subpage name (the part after "Module:Unicode data/") loads that page with
-- mw.loadData; for example, loader.blocks gives [[Module:Unicode data/blocks]].
-- The outcome is stored in the table under the same key, so the load is only
-- attempted once per key. A page that cannot be loaded is stored and returned
-- as false.
local loader = setmetatable({}, {
	__index = function (self, key)
		local ok, loaded = pcall(mw.loadData, "Module:Unicode data/" .. key)
		local entry = false
		if ok then
			entry = loaded
		end
		rawset(self, key, entry)
		return entry
	end
})

-- Ranges of code points whose names or code point labels are generated rather
-- than looked up. Each entry is { low, high, generator }, where the generator
-- is either a format string that receives the code point or a function that
-- is called with the code point (see generate_name).
-- The entries must stay sorted by code point and must not overlap, because
-- the table is searched with binary_range_search.
--
-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
local name_hooks = {
	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters
	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{   0x3400,   0x4DB5, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{   0x4E00,   0x9FEF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
	{   0xAC00,   0xD7A3, function (codepoint) -- Hangul Syllables
		-- The jamo names and the counts come from [[Module:Unicode data/Hangul]].
		local Hangul_data = loader.Hangul
		local syllable_index = codepoint - 0xAC00

		return ("HANGUL SYLLABLE %s%s%s"):format(
			Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
			Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
				/ Hangul_data.trail_count)],
			Hangul_data.trails[syllable_index % Hangul_data.trail_count]
		)
	end },
	-- High Surrogates, High Private Use Surrogates, Low Surrogates
	{   0xD800,   0xDFFF, "<surrogate-%04X>" },
	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use
	-- CJK Compatibility Ideographs
	{   0xF900,   0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{   0xFA70,   0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x17000,  0x187F1, "TANGUT IDEOGRAPH-%04X" }, -- Tangut
	{  0x18800,  0x18AF2, function (codepoint)
		-- Components are numbered in decimal, starting from 001 at U+18800.
		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
	end },
	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
	{  0x20000,  0x2A6D6, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
	{  0x2A700,  0x2B734, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
	-- Extension D starts at U+2B740. (It was previously entered as 0x2A740,
	-- which overlapped Extension C and gave names to U+2B735-U+2B73F.)
	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
	-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
		-- U+E0100 is VARIATION SELECTOR-17.
		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
	end},
	{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
}
-- Stored so that binary_range_search does not have to compute the length.
name_hooks.length = #name_hooks

local name_range_cache

-- Produces a name from the third item of a "name hook".
-- A string is used as a format string that receives the code point; anything
-- else is called with the code point and its results are returned.
local function generate_name(data, codepoint)
	if type(data) ~= "string" then
		return data(codepoint)
	end
	return data:format(codepoint)
end

--[[
-- Checks that the code point is a number and in range.
-- Does not check whether code point is an integer.
-- Not used
local function check_codepoint(funcName, argIdx, val)
	require 'libraryUtil'.checkType(funcName, argIdx, val, 'number')
	if codepoint < 0 or 0x10FFFF < codepoint then
		errorf("Codepoint %04X out of range", codepoint)
	end
end
--]]

-- Returns the name or the code point label of a code point.
-- See https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8.
function p.lookup_name(codepoint)
	-- Noncharacters: U+FDD0-U+FDEF and every code point ending in FFFE or
	-- FFFF. They are Unassigned (Cn):
	-- https://www.unicode.org/faq/private_use.html#nonchar4
	if 0xFDD0 <= codepoint then
		if codepoint <= 0xFDEF or floor(codepoint % 0x10000) >= 0xFFFE then
			return ("<noncharacter-%04X>"):format(codepoint)
		end
	end

	-- Try the most recently used "name hook" first, then search all of them.
	local hook = name_range_cache
	if not (hook and codepoint >= hook[1] and codepoint <= hook[2]) then
		hook = binary_range_search(codepoint, name_hooks)
		if hook then
			-- Remember the hook for the next call.
			name_range_cache = hook
		end
	end
	if hook then
		return generate_name(hook[3], codepoint)
	end

	-- Individual names are stored in subpages named after the code point
	-- divided by 0x1000, formatted as three hexadecimal digits.
	local names = loader[('names/%03X'):format(codepoint / 0x1000)]
	if names and names[codepoint] then
		return names[codepoint]
	end

	-- Unassigned (Cn) consists of noncharacters and reserved characters.
	-- Noncharacters were handled above, and an assigned character would have
	-- had its name retrieved by now, so the code point must be reserved.
	return ("<reserved-%04X>"):format(codepoint)
end

--[[
-- No image data modules on Wikipedia yet.
function p.lookup_image(codepoint)
	local data = loader[('images/%03X'):format(codepoint / 0x1000)]
	
	if data then
		return data[codepoint]
	end
end
--]]

-- Names of the Unicode planes that have one, keyed by plane number
-- (the code point divided by 0x10000, rounded down; see p.lookup_plane).
local planes = {
	[0] = "Basic Multilingual Plane",
	[1] = "Supplementary Multilingual Plane",
	[2] = "Supplementary Ideographic Plane",
	[14] = "Supplementary Special-purpose Plane",
	[15] = "Supplementary Private Use Area-A",
	[16] = "Supplementary Private Use Area-B",
}

-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable.
local blocks

-- Stateless iterator step used by p.enum_blocks. Given the list of blocks and
-- the previous index, returns the next index followed by the first three
-- items of that entry, or nothing when the list is exhausted.
local function block_iter(blocks, i)
	local next_index = i + 1
	local entry = blocks[next_index]
	if not entry then
		return
	end
	-- The items are listed one by one because unpack doesn't work on tables
	-- loaded with mw.loadData.
	return next_index, entry[1], entry[2], entry[3]
end

-- Iterator generator for the list of blocks, to be used like ipairs:
-- each step yields the index and the first three items of a block entry
-- from [[Module:Unicode data/blocks]].
function p.enum_blocks()
	return block_iter, loader.blocks, 0
end

-- Returns the name of the plane that contains the code point. Planes without
-- an entry in "planes" are labelled "Plane " followed by the plane number.
function p.lookup_plane(codepoint)
	local plane_number = floor(codepoint / 0x10000)
	local plane_name = planes[plane_number]
	if plane_name then
		return plane_name
	end
	return ("Plane %u"):format(plane_number)
end

-- Returns the third item (the block name) of the entry in
-- [[Module:Unicode data/blocks]] whose range contains the code point, or
-- "No Block" if no entry contains it.
function p.lookup_block(codepoint)
	local entry = binary_range_search(codepoint, loader.blocks)
	if not entry then
		return "No Block"
	end
	return entry[3]
end

-- Returns the entry of [[Module:Unicode data/blocks]] whose third item equals
-- "name". Returns nothing if no entry matches.
function p.get_block_info(name)
	for _, entry in ipairs(loader.blocks) do
		local block_name = entry[3]
		if block_name == name then
			return entry
		end
	end
end

-- Individual code points that are rejected outright by p.is_valid_pagename.
local invalid_pagename_codepoints = {
	[0x0023] = true, -- #
	[0x005B] = true, -- [
	[0x005D] = true, -- ]
	[0x007B] = true, -- {
	[0x007C] = true, -- |
	[0x007D] = true, -- }
	[0x180E] = true, -- MONGOLIAN VOWEL SEPARATOR
	[0xFFFD] = true, -- REPLACEMENT CHARACTER
}

-- Returns true if every code point of "pagename" is printable and is not one
-- of the rejected characters, and at least one code point is something other
-- than a space separator. Returns false otherwise, including for an empty
-- string.
function p.is_valid_pagename(pagename)
	local seen_non_space = false

	for cp in mw.ustring.gcodepoint(pagename) do
		if invalid_pagename_codepoints[cp]
				-- spaces in General Punctuation block
				or (0x2000 <= cp and cp <= 0x200A) then
			return false
		end

		local printable, status = p.is_printable(cp)
		if not printable then
			return false
		end

		seen_non_space = seen_non_space or status ~= "space-separator"
	end

	return seen_non_space
end

-- Replacement for unpack(what, from) that also works on tables for which the
-- built-in unpack is unusable (elsewhere in this module it is noted that
-- unpack doesn't work on tables loaded with mw.loadData).
-- "what" is the array to unpack; "from" is the index of the first item to
-- return and defaults to 1.
-- Returns what[from] alone when there is no item after it; otherwise returns
-- every item from index "from" up to the end of the sequence as seen by
-- ipairs.
local function manual_unpack(what, from)
	-- The default has to be applied before "from" is used in arithmetic;
	-- otherwise calling manual_unpack(what) raises an error.
	from = from or 1

	-- Fast path: a single item needs no intermediate table.
	if what[from + 1] == nil then
		return what[from]
	end

	local result = {}
	for i, item in ipairs(what) do
		if i >= from then
			result[#result + 1] = item
		end
	end
	return unpack(result)
end

-- Ordering function for table.sort: a range sorts before another range when
-- its first item (the low end) is smaller.
local function compare_ranges(range1, range2)
	local low1, low2 = range1[1], range2[1]
	return low1 < low2
end

-- Creates a function to look up data in a module that contains "singles" (a
-- code point-to-data map) and "ranges" (an array containing arrays that contain
-- the low and high code points of a range and the data associated with that
-- range).
-- "data_module_subpage" is the name of the data module minus the
-- "Module:Unicode data/" part; the module is loaded through "loader" the first
-- time the returned function is called.
-- "match_func" is passed the code point and either the data or the "dots", and
-- generates the final result of the function.
-- The varargs ("dots") describes the default data to be returned if there wasn't
-- a match.
-- In case the function is used more than once, "cache" saves ranges that have
-- already been found to match, or a range whose data is the default if there
-- was no match.
local function memo_lookup(data_module_subpage, match_func, ...)
	local dots = { ... }
	local cache = {}
	local singles, ranges

	return function (codepoint)
		if not singles then
			local data_module = loader[data_module_subpage]
			singles, ranges = data_module.singles, data_module.ranges
		end

		-- "singles" is consulted before the cache, so a single code point
		-- always takes precedence over any cached range.
		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end

		local cached = binary_range_search(codepoint, cache)
		if cached then
			return match_func(codepoint, manual_unpack(cached, 3))
		end

		local range, index = binary_range_search(codepoint, ranges)
		if range then
			table.insert(cache, range)
			-- The cache is binary-searched, so it has to stay sorted.
			table.sort(cache, compare_ranges)
			return match_func(codepoint, manual_unpack(range, 3))
		end

		-- No match. "index" is the position that the search probed last, so
		-- ranges[index] is a neighbour of the gap that contains the code
		-- point. Cache the whole gap with the default data, so that later
		-- lookups in the same gap are answered from the cache.
		local nearest = ranges[index]
		if nearest then
			local gap_low, gap_high
			if codepoint > nearest[2] then
				local following = ranges[index + 1]
				gap_low = nearest[2] + 1
				gap_high = following and following[1] - 1 or 0x10FFFF
			else -- codepoint < nearest[1]
				local preceding = ranges[index - 1]
				gap_low = preceding and preceding[2] + 1 or 0
				gap_high = nearest[1] - 1
			end
			table.insert(cache, { gap_low, gap_high, unpack(dots) })
			table.sort(cache, compare_ranges)
		end

		-- Pass the default data, as a cached gap would on the next lookup.
		return match_func(codepoint, unpack(dots))
	end
end

-- Looks up a code point's combining class value in
-- [[Module:Unicode data/combining]] and reports whether it is nonzero, that
-- is, returns true if the character is combining and false if it is not.
-- Zero is the default when the data module has no value for the code point.
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
p.is_combining = memo_lookup("combining", function (_, combining_class)
	if combining_class then
		return combining_class ~= 0
	end
	return false
end, 0)

-- Returns "str" with a dotted circle (◌) inserted before every character for
-- which p.is_combining is true. Other characters are left unchanged.
function p.add_dotted_circle(str)
	local function mark_if_combining(char)
		local codepoint = mw.ustring.codepoint(char)
		if p.is_combining(codepoint) then
			return '◌' .. char
		end
		-- Returning nothing makes gsub keep the character as it is.
	end

	-- Only the resulting string is returned, not the substitution count.
	local marked = mw.ustring.gsub(str, ".", mark_if_combining)
	return marked
end

-- Looks up the value stored for a code point in
-- [[Module:Unicode data/control]]. Code points without a value give
-- "assigned".
local lookup_control = memo_lookup("control", function (_, status)
	if status then
		return status
	end
	return "assigned"
end, "assigned")
p.lookup_control = lookup_control

-- Returns true unless lookup_control reports the code point as "unassigned".
function p.is_assigned(codepoint)
	local status = lookup_control(codepoint)
	return status ~= "unassigned"
end

-- Returns two values: whether lookup_control reports the code point as
-- "assigned" or "space-separator", and the value that lookup_control returned.
function p.is_printable(codepoint)
	local status = lookup_control(codepoint)
	local printable = status == "assigned" or status == "space-separator"
	return printable, status
end

-- Returns two values: whether lookup_control reports the code point as
-- "space-separator", and the value that lookup_control returned.
function p.is_whitespace(codepoint)
	local status = lookup_control(codepoint)
	local is_space = status == "space-separator"
	return is_space, status
end

-- Looks up the value stored for a code point in
-- [[Module:Unicode data/category]]. Code points without a value give "Cn"
-- (Unassigned).
p.lookup_category = memo_lookup(
	"category",
	function (codepoint, category)
		-- The match function can be called without any data when the code
		-- point is absent from the data module, so fall back to the default
		-- here, like the match functions of lookup_control and lookup_script
		-- do, instead of returning nil.
		return category or "Cn"
	end,
	"Cn")

-- Looks up the script code stored for a code point in
-- [[Module:Unicode data/scripts]]. Code points without a value give 'Zzzz'.
local lookup_script = memo_lookup("scripts", function (_, script_code)
	if script_code then
		return script_code
	end
	return 'Zzzz'
end, "Zzzz")
p.lookup_script = lookup_script

-- Collects the script codes of the characters in "str", ignoring Zyyy, Zinh
-- and Zzzz. If at most one script code was collected, returns the result of
-- next() on the collection (the script code followed by true, or nil when
-- nothing was collected). If two or more were collected, returns nothing.
function p.get_best_script(str)
	-- The type is checked up front because mw.text.decode coerces numbers to
	-- strings!
	require "libraryUtil".checkType("get_best_script", 1, str, "string")

	-- Turn HTML character references (including named character references,
	-- or character entities) into the characters they stand for.
	str = mw.text.decode(str, true)

	local found = {}
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		-- Skip the "Inherited", "Undetermined", or "Uncoded" scripts.
		if script ~= "Zyyy" and script ~= "Zinh" and script ~= "Zzzz" then
			found[script] = true
		end
	end

	-- A second key exists only if next() finds something after the first.
	local has_second_script = next(found, (next(found)))
	if not has_second_script then
		return next(found)
	end -- else return majority script, or else "Zzzz"?
end

-- Returns true if "str" contains at least one character of the Latn script
-- and no character of any script other than Latn, Zyyy, Zinh and Zzzz.
-- Returns false otherwise. HTML character references are decoded before the
-- check. Raises an error if "str" is not a string.
function p.is_Latin(str)
	-- Report this function's own name in the argument error (it previously
	-- said "get_best_script").
	require "libraryUtil".checkType("is_Latin", 1, str, "string")
	str = mw.text.decode(str, true)
	
	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0340-U+10FFFF. If they are not found and there is at least
	-- one Latin-script character, the string counts as Latin, because the rest
	-- of the characters can only be Zyyy, Zinh, and Zzzz.
	-- The only scripts found below U+0370 (the first code point of the Greek
	-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
	-- See the codepage in the [[UTF-8]] article.
	if not str:find "[\205-\244]" then
		for codepoint in mw.ustring.gcodepoint(str) do
			if lookup_script(codepoint) == "Latn" then
				return true
			end
		end
	end
	
	-- General case: every character has to be Latn or of an ignorable script,
	-- and at least one has to be Latn.
	local Latn = false
	
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)
		
		if script == "Latn" then
			Latn = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false
		end
	end
	
	return Latn
end

-- Checks that a string contains only characters belonging to right-to-left
-- scripts, or characters of ignorable scripts.
-- Returns true if at least one character belongs to a script listed in the
-- "rtl" table of [[Module:Unicode data/scripts]] and every other character is
-- Zyyy, Zinh or Zzzz. Returns false otherwise. HTML character references are
-- decoded before the check. Raises an error if "str" is not a string.
function p.is_rtl(str)
	-- Report this function's own name in the argument error (it previously
	-- said "get_best_script").
	require "libraryUtil".checkType("is_rtl", 1, str, "string")
	str = mw.text.decode(str, true)
	
	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0580-U+10FFFF. If they are not found, the string can only
	-- have characters from a left-to-right script, because the first code point
	-- in a right-to-left script is U+0591, in the Hebrew block.
	if not str:find "[\214-\244]" then
		return false
	end
	
	local result = false
	local rtl = loader.scripts.rtl
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)
		
		if rtl[script] then
			result = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			-- A character of a script that is neither right-to-left nor
			-- ignorable settles the answer.
			return false
		end
	end
	
	return result
end

-- Reads parameter "arg" from the table "args" and interprets it as a code
-- point written in hexadecimal base.
--
-- Returns the code point as a number. Throws an error if the parameter is
-- missing, is not a hexadecimal number, or lies outside 0-0x10FFFF.
local function get_codepoint(args, arg)
	local hex = args[arg]
	if not hex then
		errorf(2, "Parameter %s is required", tostring(arg))
	end

	local codepoint = tonumber(hex, 16)
	if not codepoint then
		errorf(2, "Parameter %s is not a code point in hexadecimal base",
			tostring(arg))
	end

	local in_range = 0 <= codepoint and codepoint <= 0x10FFFF
	if not in_range then
		errorf(2, "code point in parameter %s out of range", tostring(arg))
	end

	return codepoint
end

-- Finds the exported function whose name is "prefix" followed by the trimmed
-- value of parameter "arg" in the table "args".
--
-- Returns the function. Throws an error if the parameter is missing or if the
-- module table has no entry under the resulting name.
local function get_func(args, arg, prefix)
	local suffix = args[arg]
	if not suffix then
		errorf(2, "Parameter %s is required", tostring(arg))
	end

	-- Surrounding whitespace in the parameter is not part of the name.
	local func_name = prefix .. mw.text.trim(suffix)

	local func = p[func_name]
	if not func then
		errorf(2, "There is no function '%s'", func_name)
	end
	return func
end

-- Entry point that invokes any of the "lookup" functions. The first parameter
-- is the part of the function name that follows "lookup_"; the second
-- parameter is the code point in hexadecimal base.
function p.lookup(frame)
	local args = frame.args
	local func = get_func(args, 1, "lookup_")
	local codepoint = get_codepoint(args, 2)
	local result = func(codepoint)

	if func ~= p.lookup_name then
		return result
	end

	-- Code point labels such as <control-0000> would otherwise be interpreted
	-- as HTML tags. The parentheses discard the substitution count returned by
	-- gsub.
	return (result:gsub("<", "&lt;"))
end

-- Entry point that invokes any of the "is" functions. The first parameter is
-- the part of the function name that follows "is_"; the second parameter is
-- the value to test.
function p.is(frame)
	local args = frame.args
	local func = get_func(args, 1, "is_")

	-- is_Latin, is_valid_pagename, and is_rtl take the string as it is; all
	-- other functions take a code point, given in hexadecimal base.
	local takes_string = func == p.is_Latin
		or func == p.is_valid_pagename
		or func == p.is_rtl

	local value
	if takes_string then
		value = args[2]
	else
		value = get_codepoint(args, 2)
	end

	-- The parentheses adjust the call to exactly one result.
	return (func(value))
end

-- Hand the table of exported functions to whoever loads this module.
return p