Module:Unicode data
Revision as of 21:18, 6 July 2018 by imported>Erutuon ("code point" is the official spelling in the Unicode documents)
Documentation for this module may be created at Module:Unicode data/doc
local p = {}

local floor = math.floor

-- Raises an error whose message is built with string.format.
-- Call as errorf(level, fmt, ...) to blame a specific stack level (counted
-- relative to the caller of errorf), or as errorf(fmt, ...) to blame the
-- caller of the function that called errorf.
local function errorf(level, ...)
	if type(level) == "number" then
		return error(string.format(...), level + 1)
	else -- level is actually the format string.
		return error(string.format(level, ...), 2)
	end
end

-- Copies the array part of "what", starting at index "from" (default 1), into
-- a fresh table and returns its elements as multiple values.
-- This is used instead of unpack for tables loaded with mw.loadData, because
-- it reads the table through ipairs rather than through raw access.
local function manual_unpack(what, from)
	local result = {}
	from = from or 1
	for i, item in ipairs(what) do
		if i >= from then
			table.insert(result, item)
		end
	end
	return unpack(result)
end

-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
local Hangul_data -- loaded if needed

-- Ranges of code points whose names are generated rather than stored.
-- Each entry is { first code point, last code point, generator }, where the
-- generator is either a format string that receives the code point, or a
-- function that is called with the code point.
-- The entries must stay sorted and must not overlap, because they are
-- searched with binary_range_search.
local name_hooks = {
	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters
	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{   0x3400,   0x4DB5, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{   0x4E00,   0x9FEF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
	{   0xAC00,   0xD7A3, function (codepoint) -- Hangul Syllables
		Hangul_data = Hangul_data or mw.loadData("Module:Unicode data/Hangul")
		local syllable_index = codepoint - 0xAC00

		return ("HANGUL SYLLABLE %s%s%s"):format(
			Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
			Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
				/ Hangul_data.trail_count)],
			Hangul_data.trails[syllable_index % Hangul_data.trail_count]
		)
	end },
	-- High Surrogates, High Private Use Surrogates, Low Surrogates
	{   0xD800,   0xDFFF, "<surrogate-%04X>" },
	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use
	{   0xF900,   0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, -- CJK Compatibility Ideographs
	{   0xFA70,   0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x17000,  0x187F1, "TANGUT IDEOGRAPH-%04X" }, -- Tangut
	{  0x18800,  0x18AF2, function (codepoint)
		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
	end },
	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
	{  0x20000,  0x2A6D6, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
	{  0x2A700,  0x2B734, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
	-- Extension D starts at U+2B740; a lower start would overlap Extension C.
	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
	-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
	end },
	{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
}
name_hooks.length = #name_hooks

-- Binary search for the range containing "codepoint".
-- "ranges" is an array of arrays whose first two items are the low and high
-- code points of a range; it must be sorted and non-overlapping. Its length is
-- read from ranges.length if present, otherwise computed by Module:Table.
-- Returns the matching range and its index, or nil and the last index that
-- was examined (nil if "ranges" is empty).
local function binary_range_search(codepoint, ranges)
	local low, mid, high
	low, high = 1, ranges.length or require "Module:Table".length(ranges)
	while low <= high do
		mid = floor((low + high) / 2)
		local range = ranges[mid]
		if codepoint < range[1] then
			high = mid - 1
		elseif codepoint <= range[2] then
			return range, mid
		else
			low = mid + 1
		end
	end
	return nil, mid
end

-- Returns the first range in the array "ranges" that contains "codepoint",
-- or nothing if there is none. Unlike binary_range_search, "ranges" does not
-- have to be sorted.
local function linear_range_search(codepoint, ranges)
	for i, range in ipairs(ranges) do
		if range[1] <= codepoint and codepoint <= range[2] then
			return range
		end
	end
end

-- The entry of name_hooks that matched most recently in p.lookup_name.
local name_range_cache

-- Applies a name_hooks generator (format string or function) to a code point.
local function generate_name(data, codepoint)
	if type(data) == "string" then
		return data:format(codepoint)
	else
		return data(codepoint)
	end
end

-- Returns the name or code point label of "codepoint" (a number).
-- Raises an error if the code point is not a number or is outside
-- 0-0x10FFFF.
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function p.lookup_name(codepoint)
	require 'libraryUtil'.checkType('lookup_name', 1, codepoint, 'number')
	if codepoint < 0 or 0x10FFFF < codepoint then
		errorf("Code point %04X out of range", codepoint)
	end

	-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
	-- (Cn) and specifically noncharacters:
	-- https://www.unicode.org/faq/private_use.html#nonchar4
	if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
			or floor(codepoint % 0x10000) >= 0xFFFE) then
		return ("<noncharacter-%04X>"):format(codepoint)
	end

	if name_range_cache -- Check if previously used "name hook" applies to this code point.
			and codepoint >= name_range_cache[1]
			and codepoint <= name_range_cache[2] then
		return generate_name(name_range_cache[3], codepoint)
	end

	local range = binary_range_search(codepoint, name_hooks)
	if range then
		name_range_cache = range
		return generate_name(range[3], codepoint)
	end

	-- Names are stored in subpages keyed by the code point divided by 0x1000.
	-- pcall is used so that a subpage that cannot be loaded is treated as
	-- containing no names.
	local success, data = pcall(mw.loadData,
		('Module:Unicode data/names/%03X'):format(floor(codepoint / 0x1000)))

	if success and data[codepoint] then
		return data[codepoint]

	-- Unassigned (Cn) consists of noncharacters and reserved characters.
	-- The character has been established not to be a noncharacter,
	-- and if it were assigned, its name would already have been retrieved,
	-- so it must be reserved.
	else
		return ("<reserved-%04X>"):format(codepoint)
	end
end

-- Returns the entry for "codepoint" in the matching
-- "Module:Unicode data/images/XXX" subpage, or nothing if that subpage
-- cannot be loaded or has no entry.
function p.lookup_image(codepoint)
	local success, data = pcall(mw.loadData,
		('Module:Unicode data/images/%03X'):format(floor(codepoint / 0x1000)))

	if success then
		return data[codepoint]
	end
end

-- Template entry point: takes a hexadecimal code point from the first
-- parameter of the frame (or of its parent frame) and returns its name with
-- "<" escaped as "&lt;" so that labels such as <control-0000> are not
-- treated as markup.
function p.template_lookup_name(frame)
	local param = frame.args[1] or frame:getParent().args[1]
	-- Guard against a missing parameter: tonumber(nil, 16) would raise its
	-- own, less helpful, error.
	local codepoint = param and tonumber(param, 16)
	if not codepoint then
		errorf("Expected a code point in hexadecimal base, got '%s'",
			tostring(param))
	end
	local name = p.lookup_name(codepoint):gsub("<", "&lt;")
	return name
end

-- Names of the Unicode planes that have one, keyed by plane number.
local planes = {
	[ 0] = "Basic Multilingual Plane";
	[ 1] = "Supplementary Multilingual Plane";
	[ 2] = "Supplementary Ideographic Plane";
	[14] = "Supplementary Special-purpose Plane";
	[15] = "Supplementary Private Use Area-A";
	[16] = "Supplementary Private Use Area-B";
}

-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable.
local blocks

-- Iterator function for p.enum_blocks: returns the next index followed by
-- the items of the block at that index, or nothing at the end of the list.
local function block_iter(blocks, i)
	i = i + 1
	local data = blocks[i]
	if data then
		-- unpack doesn't work on tables loaded with mw.loadData.
		return i, manual_unpack(data)
	end
end

-- An ipairs-type iterator generator for the list of blocks.
function p.enum_blocks()
	blocks = blocks or mw.loadData("Module:Unicode data/blocks")
	return block_iter, blocks, 0
end

-- Returns the name of the plane containing "codepoint", or "Plane N" for
-- planes without an entry in the "planes" table.
function p.lookup_plane(codepoint)
	local i = floor(codepoint / 0x10000)
	return planes[i] or ("Plane %u"):format(i)
end

-- Returns the third item (the name) of the block containing "codepoint", or
-- "No Block" if no block contains it.
function p.lookup_block(codepoint)
	blocks = blocks or mw.loadData("Module:Unicode data/blocks")
	local range = binary_range_search(codepoint, blocks)
	if range then
		return range[3]
	else
		return "No Block"
	end
end

-- Returns the block entry whose third item equals "name", or nothing if
-- there is no such block.
function p.get_block_info(name)
	blocks = blocks or mw.loadData("Module:Unicode data/blocks")
	for i, block in ipairs(blocks) do
		if block[3] == name then
			return block
		end
	end
end

-- Returns true if "pagename" contains none of the code points rejected
-- below, contains only printable code points (according to p.is_printable),
-- and contains at least one code point that is not a space separator.
function p.is_valid_pagename(pagename)
	local has_nonws = false

	for cp in mw.ustring.gcodepoint(pagename) do
		if (cp == 0x0023) -- #
				or (cp == 0x005B) -- [
				or (cp == 0x005D) -- ]
				or (cp == 0x007B) -- {
				or (cp == 0x007C) -- |
				or (cp == 0x007D) -- }
				or (cp == 0x180E) -- MONGOLIAN VOWEL SEPARATOR
				or ((cp >= 0x2000) and (cp <= 0x200A)) -- spaces in General Punctuation block
				or (cp == 0xFFFD) -- REPLACEMENT CHARACTER
		then
			return false
		end

		local printable, result = p.is_printable(cp)
		if not printable then
			return false
		end

		if result ~= "space-separator" then
			has_nonws = true
		end
	end

	return has_nonws
end

-- Creates a function to look up data in a module that contains "singles" (a
-- code point-to-data map) and "ranges" (an array containing arrays that contain
-- the low and high code points of a range and the data associated with that
-- range).
-- "loader" loads and returns the "singles" and "ranges" tables.
-- "match_func" is passed the code point and either the data or the "dots", and
-- generates the final result of the function.
-- The varargs ("dots") describes the default data to be returned if there wasn't
-- a match.
-- In case the function is used more than once, "cache" saves ranges that have
-- already been found to match, or a range whose data is the default if there
-- was no match.
local function memo_lookup(loader, match_func, ...)
	local dots = { ... }
	local cache = {}
	local singles, ranges

	return function (codepoint)
		if not singles then
			singles, ranges = loader()
		end

		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end

		-- Cached ranges are plain tables, so unpack can be used on them.
		local cached = linear_range_search(codepoint, cache)
		if cached then
			return match_func(codepoint, unpack(cached, 3))
		end

		local found, index = binary_range_search(codepoint, ranges)
		if found then
			table.insert(cache, { manual_unpack(found) })
			return match_func(codepoint, manual_unpack(found, 3))
		end

		-- No match. "index" is the last range examined by the binary search,
		-- so the code point lies in the gap just after or just before it.
		-- Cache that gap with the default data.
		if ranges[index] then
			local dots_range
			if codepoint > ranges[index][2] then
				dots_range = {
					ranges[index][2] + 1,
					ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF,
					unpack(dots)
				}
			else -- codepoint < ranges[index][1]
				dots_range = {
					ranges[index - 1] and ranges[index - 1][2] + 1 or 0,
					ranges[index][1] - 1,
					unpack(dots)
				}
			end
			table.insert(cache, dots_range)
		end

		-- Pass the default data, so that the first lookup of an unmatched
		-- code point gives the same result as later lookups served from the
		-- cache.
		return match_func(codepoint, unpack(dots))
	end
end

-- Get a code point's combining class value in [[Module:Unicode data/combining]],
-- and return whether this value is not zero. Zero is assigned as the default
-- if the combining class value is not found in this data module.
-- That is, return true if character is combining, or false if it is not.
-- See http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
p.is_combining = memo_lookup(
	function ()
		local m_comb = mw.loadData('Module:Unicode data/combining')
		return m_comb.singles, m_comb.ranges
	end,
	function (codepoint, combining_class)
		return combining_class and combining_class ~= 0 or false
	end,
	0)

-- Returns "str" with a dotted circle (U+25CC) inserted before every
-- character for which p.is_combining returns true.
function p.add_dotted_circle(str)
	return (mw.ustring.gsub(str, ".",
		function (char)
			if p.is_combining(mw.ustring.codepoint(char)) then
				return '◌' .. char
			end
		end))
end

-- Looks up a code point in [[Module:Unicode data/control]]; code points not
-- found there are given the value "assigned".
local lookup_control = memo_lookup(
	function ()
		local m_cc = mw.loadData('Module:Unicode data/control')
		return m_cc.singles, m_cc.ranges
	end,
	function (codepoint, ccc)
		return ccc or "assigned"
	end,
	"assigned")

-- Returns false if the control data says the code point is "unassigned",
-- true otherwise.
function p.is_assigned(codepoint)
	return lookup_control(codepoint) ~= "unassigned"
end

-- Returns whether the code point's control value is "assigned" or
-- "space-separator", followed by the control value itself.
function p.is_printable(codepoint)
	local result = lookup_control(codepoint)
	return (result == "assigned") or (result == "space-separator"), result
end

-- Returns whether the code point's control value is "space-separator",
-- followed by the control value itself.
function p.is_whitespace(codepoint)
	local result = lookup_control(codepoint)
	return (result == "space-separator"), result
end

-- Returns the value for a code point in [[Module:Unicode data/category]],
-- or "Cn" if the code point is not found there.
p.lookup_category = memo_lookup(
	function ()
		local category_data = mw.loadData "Module:Unicode data/category"
		return category_data.singles, category_data.ranges
	end,
	function (codepoint, category)
		return category
	end,
	"Cn")

-- Ranges of [[Module:Unicode data/scripts]] that have already matched.
local script_range_cache
local script_data

-- Returns the script code for a code point from
-- [[Module:Unicode data/scripts]], or "Zzzz" if the code point is not found
-- there.
local function lookup_script(codepoint)
	script_data = script_data or mw.loadData("Module:Unicode data/scripts")

	if script_data.singles[codepoint] then
		return script_data.singles[codepoint]
	end

	if script_range_cache then
		local cached = linear_range_search(codepoint, script_range_cache)
		if cached then
			return cached[3]
		end
	end

	local range = binary_range_search(codepoint, script_data.ranges)
	if range then
		script_range_cache = script_range_cache or {}
		table.insert(script_range_cache, range)
		return range[3]
	end

	return "Zzzz"
end

p.lookup_script = lookup_script

-- Returns the script code shared by all characters of "str", ignoring the
-- codes "Zyyy", "Zinh" and "Zzzz". Returns nil if no script or more than one
-- script is found.
function p.get_best_script(str)
	local scripts = {}
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		-- Ignore "Inherited", "Undetermined", or "Uncoded" scripts.
		if not (script == "Zyyy" or script == "Zinh" or script == "Zzzz") then
			scripts[script] = true
		end
	end

	-- If scripts does not contain two or more keys,
	-- return first and only key (script code) in table.
	if not next(scripts, next(scripts)) then
		return next(scripts)
	end -- else return majority script, or else "Zzzz"?
end

-- Returns true if every character of "str" has the script code "Latn",
-- "Zyyy", "Zinh" or "Zzzz", and false otherwise.
function p.is_Latin(str)
	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0340-U+10FFFF. See the codepage in the [[UTF-8]] article.
	-- If they are not found, then there is no need to test the scripts in the
	-- string, because the only scripts found below U+0370 (the first code point
	-- of the Greek and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
	if not str:find "[\205-\244]" then
		return true
	end

	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		if not (script == "Latn" or script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false
		end
	end

	return true
end

return p