Difference between revisions of "Module:Unicode data"
Jump to navigation
Jump to search
imported>Erutuon (char_to_script intended for Wiktionary script classes, which are not used on Wikipedia; but perhaps Module:Language/scripts should be moved here) |
imported>Erutuon (moved block data to Module:Unicode data/blocks) |
||
Line 145: | Line 145: | ||
} | } | ||
− | + | local blocks | |
− | |||
− | local | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
function export.enum_blocks() | function export.enum_blocks() | ||
+ | blocks = blocks or mw.loadData("Module:Unicode data/blocks") | ||
return function (blocks, i) | return function (blocks, i) | ||
i = i + 1 | i = i + 1 | ||
Line 461: | Line 167: | ||
-- higher codepoints. | -- higher codepoints. | ||
function export.lookup_block(codepoint) | function export.lookup_block(codepoint) | ||
+ | blocks = blocks or mw.loadData("Module:Unicode data/blocks") | ||
local iStart, iEnd = 1, blocks.length or #blocks | local iStart, iEnd = 1, blocks.length or #blocks | ||
while iStart <= iEnd do | while iStart <= iEnd do | ||
Line 478: | Line 185: | ||
function export.get_block_range(name) | function export.get_block_range(name) | ||
local range | local range | ||
+ | blocks = blocks or mw.loadData("Module:Unicode data/blocks") | ||
for i, block in ipairs(blocks) do | for i, block in ipairs(blocks) do |
Revision as of 05:45, 23 June 2018
Documentation for this module may be created at Module:Unicode data/doc
-- Unicode character data lookups (names, planes, blocks, combining/control
-- classes) for use from other modules and from templates.
local export = {}

local floor = math.floor
-- `unpack` is a global in Lua 5.1 (Scribunto) and `table.unpack` in 5.2+.
local unpack = unpack or table.unpack

-- Return the items of the sequence `what`, starting at index `from`
-- (default 1), as multiple values.
-- The items are first copied into a plain table via ipairs, because the
-- built-in unpack reads the table raw and therefore returns nothing for the
-- read-only proxy tables handed out by mw.loadData.
local function manual_unpack(what, from)
	local result = {}
	from = from or 1
	for i, item in ipairs(what) do
		if i >= from then
			table.insert(result, item)
		end
	end
	return unpack(result)
end

-- The following leads, vowels, and trails come from here:
-- http://www.unicode.org/Public/UNIDATA/Jamo.txt
-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
local hangul_leads = {
	[0] = "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
	"S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
}
-- not needed:
-- hangul_leads.length = #hangul_leads + 1

local hangul_vowels = {
	[0] = "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
	"WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
}
-- The tables are 0-based, so the item count is the sequence length plus one.
local hangul_vowel_count = #hangul_vowels + 1

local hangul_trails = {
	[0] = "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB",
	"LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T",
	"P", "H"
}
local hangul_trail_count = #hangul_trails + 1

-- Number of syllables that share one leading consonant.
local hangul_coda_count = hangul_vowel_count * hangul_trail_count

-- Ranges whose character names are generated rather than stored.
-- Each item is { first codepoint, last codepoint, format string or function }.
-- The list MUST stay sorted by first codepoint and the ranges must not
-- overlap: export.lookup_name stops scanning at the first range that starts
-- above the codepoint being looked up.
local name_hooks = {
	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters
	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{   0x3400,   0x4DB5, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{   0x4E00,   0x9FEF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph --change v10
	{   0xAC00,   0xD7A3, function (codepoint) -- Hangul Syllables
		local syllable_index = codepoint - 0xAC00

		return ("HANGUL SYLLABLE %s%s%s"):format(
			hangul_leads[floor(syllable_index / hangul_coda_count)],
			hangul_vowels[floor((syllable_index % hangul_coda_count)
				/ hangul_trail_count)],
			hangul_trails[syllable_index % hangul_trail_count]
		)
	end },
	-- Non Private Use High Surrogate, Private Use High Surrogate, Low Surrogate
	{   0xD800,   0xDFFF, "<surrogate-%04X>" },
	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use
	{  0x17000,  0x187F1, "TANGUT IDEOGRAPH-%05X" }, -- Tangut
	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%05X" }, -- Nushu --add v10
	{  0x20000,  0x2A6D6, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension B
	{  0x2A700,  0x2B734, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension C
	-- Extension D starts at U+2B740; U+2B735..U+2B73F between C and D are
	-- unassigned and must not receive a generated name.
	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension D
	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension E
	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension F
	-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%05X" },
	{  0xF0000,  0xFFFFD, "<private-use-%05X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%06X>" }  -- Plane 16 Private Use
}

-- The most recently matched item of name_hooks (nil until the first match).
local name_range_cache

-- Produce a name from a name hook: `data` is either a format string applied
-- to the codepoint or a function called with the codepoint.
local function generate_name(data, codepoint)
	if type(data) == "string" then
		return data:format(codepoint)
	else
		return data(codepoint)
	end
end

-- Return the name (or a bracketed label such as "<control-0000>") of the
-- given codepoint. Raises an error if the codepoint is outside
-- 0..0x10FFFF.
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function export.lookup_name(codepoint)
	if codepoint < 0 or 0x10FFFF < codepoint then
		error(("Codepoint %04X out of range"):format(codepoint))
	end

	-- U+FDD0-U+FDEF and all codepoints ending in FFFE or FFFF are Unassigned
	-- (Cn) and specifically noncharacters:
	-- https://www.unicode.org/faq/private_use.html#nonchar4
	if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
			or floor(codepoint % 0x10000) >= 0xFFFE) then
		return ("<noncharacter-%04X>"):format(codepoint)
	end

	if name_range_cache -- Check if previously used "name hook" applies to this codepoint.
			and codepoint >= name_range_cache[1]
			and codepoint <= name_range_cache[2] then
		return generate_name(name_range_cache[3], codepoint)
	end

	for _, item in ipairs(name_hooks) do
		if codepoint < item[1] then
			-- Hooks are sorted, so no later hook can match either.
			break
		elseif codepoint <= item[2] then
			-- Save "name hook" in case another character
			-- from the same range will be looked up in the same module invocation.
			name_range_cache = item
			return generate_name(item[3], codepoint)
		end
	end

	-- Names are stored in data modules of 0x1000 codepoints each. The page
	-- index is floored so that it is always an integer for the %X format.
	local success, data = pcall(mw.loadData,
		('Module:Unicode data/names/%03X'):format(floor(codepoint / 0x1000)))

	if success and data[codepoint] then
		return data[codepoint]

	-- Unassigned (Cn) consists of noncharacters and reserved characters.
	-- The character has been established not to be a noncharacter,
	-- and if it were assigned, its name would already have been retrieved,
	-- so it must be reserved.
	else
		return ("<reserved-%04X>"):format(codepoint)
	end
end

-- Return the entry for the codepoint in the matching
-- "Module:Unicode data/images/XXX" data module, or nil if that module cannot
-- be loaded or has no entry for the codepoint.
function export.lookup_image(codepoint)
	local success, data = pcall(mw.loadData,
		('Module:Unicode data/images/%03X'):format(floor(codepoint / 0x1000))
	)

	if success then
		return data[codepoint]
	end
end

-- Template entry point: look up the name of the codepoint given as the first
-- argument (of the #invoke or of the calling template) and return it with
-- "<" escaped, so that labels such as "<control-0000>" are not treated as
-- markup.
function export.template_lookup_name(frame)
	local codepoint = tonumber(frame.args[1] or frame:getParent().args[1])
	if not codepoint then
		error("template_lookup_name: parameter 1 must be a number (codepoint)")
	end
	local name = export.lookup_name(codepoint)
	-- The parentheses discard gsub's second result (the match count), which
	-- would otherwise be returned as an extra value.
	return (name:gsub("<", "&lt;"))
end

local planes = {
	[ 0] = "Basic Multilingual Plane";
	[ 1] = "Supplementary Multilingual Plane";
	[ 2] = "Supplementary Ideographic Plane";
	[13] = "Supplementary Special-purpose Plane";
	[14] = "Supplementary Private Use Area-A";
	[15] = "Supplementary Private Use Area-B";
}

-- Block data, loaded from "Module:Unicode data/blocks" on first use.
-- The functions below read each row as { name, first codepoint, last codepoint }.
local blocks

-- Return a stateless iterator over the blocks, for use in a generic `for`:
-- each step yields the block index followed by the fields of that block's row.
function export.enum_blocks()
	blocks = blocks or mw.loadData("Module:Unicode data/blocks")
	return function (block_list, i)
		i = i + 1
		local data = block_list[i]
		if not data then
			return nil
		end
		-- manual_unpack rather than unpack: `data` comes from mw.loadData.
		return i, manual_unpack(data)
	end, blocks, 0
end

-- Return the name of the plane containing the codepoint; planes without a
-- specific name are reported as "Plane N".
function export.lookup_plane(codepoint)
	local i = floor(codepoint / 0x10000)
	return planes[i] or ("Plane %u"):format(i)
end

-- Return the name of the block containing the codepoint, or raise an error if
-- no block contains it.
-- Binary search, to avoid iterating over entire table in order to look up the
-- higher codepoints.
function export.lookup_block(codepoint)
	blocks = blocks or mw.loadData("Module:Unicode data/blocks")
	-- NOTE(review): `#` is not reliable on mw.loadData tables, so this
	-- presumably relies on the data module defining `length` - confirm.
	local iStart, iEnd = 1, blocks.length or #blocks

	while iStart <= iEnd do
		local iMid = floor((iStart + iEnd) / 2)
		local range = blocks[iMid]
		if codepoint < range[2] then
			iEnd = iMid - 1
		elseif codepoint <= range[3] then
			return range[1]
		else
			iStart = iMid + 1
		end
	end

	error(string.format("No block found for codepoint U+%04X.", codepoint))
end

-- Return the first and last codepoint of the block with the given name, or
-- nothing if there is no block with that name.
function export.get_block_range(name)
	blocks = blocks or mw.loadData("Module:Unicode data/blocks")

	for _, block in ipairs(blocks) do
		if block[1] == name then
			return block[2], block[3]
		end
	end
end

-- Return true if every codepoint of `pagename` is printable and not one of
-- the characters rejected below, and at least one codepoint is something
-- other than a space separator; return false otherwise.
function export.is_valid_pagename(pagename)
	local has_nonws = false

	for cp in mw.ustring.gcodepoint(pagename) do
		if (cp == 0x0023) -- #
		or (cp == 0x005B) -- [
		or (cp == 0x005D) -- ]
		or (cp == 0x007B) -- {
		or (cp == 0x007C) -- |
		or (cp == 0x007D) -- }
		or (cp == 0x180E) -- MONGOLIAN VOWEL SEPARATOR
		or ((cp >= 0x2000) and (cp <= 0x200A)) -- spaces in General Punctuation block
		or (cp == 0xFFFD) -- REPLACEMENT CHARACTER
		then
			return false
		end

		local printable, result = export.is_printable(cp)
		if not printable then
			return false
		end

		if result ~= "space-separator" then
			has_nonws = true
		end
	end

	return has_nonws
end

-- Build a memoizing codepoint lookup function.
--   loader:     called once, on the first lookup; must return two tables,
--               `singles` (codepoint -> value) and `ranges` (a list of
--               { first, last, values... }).
--   match_func: called as match_func(codepoint, values...) to turn the values
--               found into the result of the lookup.
--   ...:        default values, used for codepoints that fall in a gap before
--               a range.
-- Ranges (and gaps) that have been hit once are kept in `cache` and checked
-- before `ranges` is scanned again.
local function memo_lookup(loader, match_func, ...)
	local dots = { ... }
	local cache = {}
	local singles, ranges

	return function (codepoint)
		if not singles then
			singles, ranges = loader()
		end

		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end

		-- Last codepoint of the range preceding the one being examined.
		local lastlast = -1

		for _, range in pairs(cache) do
			if (range[1] <= codepoint) and (codepoint <= range[2]) then
				return match_func(codepoint, unpack(range, 3))
			end
		end

		-- NOTE(review): this scan assumes `ranges` is visited in ascending
		-- order, which pairs() does not formally guarantee - confirm against
		-- the data modules before changing it to ipairs().
		for _, range in pairs(ranges) do
			if codepoint < range[1] then
				-- The codepoint lies in the gap before this range.
				table.insert(cache, { lastlast + 1, range[1] - 1, unpack(dots) })
				return match_func(codepoint, unpack(dots))
			elseif codepoint <= range[2] then
				table.insert(cache, { manual_unpack(range) })
				return match_func(codepoint, manual_unpack(range, 3))
			else
				lastlast = range[2]
			end
		end

		return match_func(codepoint)
	end
end

-- Get a codepoint's combining class value in [[Module:Unicode data/combining]],
-- and return whether this value is not zero. Zero is assigned as the default
-- if the combining class value is not found in this data module.
-- That is, return true if character is combining, or false if it is not.
-- See http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
export.is_combining = memo_lookup(function ()
	local m_comb = mw.loadData('Module:Unicode data/combining')
	return m_comb.single, m_comb.ranges
end, function (codepoint, combining_class)
	return combining_class and combining_class ~= 0 or false
end, 0)

-- Return `str` with a dotted circle (U+25CC) inserted before every combining
-- character.
function export.add_dotted_circle(str)
	-- The parentheses discard gsub's second result (the match count).
	return (mw.ustring.gsub(str, ".",
		function(char)
			if export.is_combining(mw.ustring.codepoint(char)) then
				return '◌' .. char
			end
		end))
end

-- Look up a codepoint's category in [[Module:Unicode data/control]];
-- codepoints not listed there are reported as "assigned".
local lookup_control = memo_lookup(function ()
	local m_cc = mw.loadData('Module:Unicode data/control')
	return m_cc.single, m_cc.ranges
end, function (codepoint, ccc)
	return ccc or "assigned"
end, "assigned")

-- Return true unless the codepoint's category is "unassigned".
function export.is_assigned(codepoint)
	return lookup_control(codepoint) ~= "unassigned"
end

-- Return whether the codepoint's category is "assigned" or "space-separator",
-- followed by the category itself.
function export.is_printable(codepoint)
	local result = lookup_control(codepoint)
	return (result == "assigned") or (result == "space-separator"), result
end

-- Return whether the codepoint's category is "space-separator", followed by
-- the category itself.
function export.is_whitespace(codepoint)
	local result = lookup_control(codepoint)
	return (result == "space-separator"), result
end

-- Titles to use for characters that get_entry_title does not return as
-- themselves.
local unsupported_title = {
	[0x0020] = "Unsupported titles/Space";
	[0x0023] = "Unsupported titles/Number sign";
	[0x002E] = "Unsupported titles/Full stop";
	[0x003A] = "Unsupported titles/Colon";
	[0x003C] = "Unsupported titles/Less than";
	[0x003E] = "Unsupported titles/Greater than";
	[0x005B] = "Unsupported titles/Left square bracket";
	[0x005D] = "Unsupported titles/Right square bracket";
	[0x005F] = "Unsupported titles/Low line";
	[0x007B] = "Unsupported titles/Left curly bracket";
	[0x007C] = "Unsupported titles/Vertical line";
	[0x007D] = "Unsupported titles/Right curly bracket";
	[0x1680] = "Unsupported titles/Ogham space";
	[0xFFFD] = "Unsupported titles/Replacement character";
}

-- Return the entry title for a codepoint: its "Unsupported titles/..." name
-- if it has one, nil if its category is anything other than "assigned", and
-- otherwise the character itself.
function export.get_entry_title(codepoint)
	if unsupported_title[codepoint] then
		return unsupported_title[codepoint]
	end
	if lookup_control(codepoint) ~= "assigned" then
		return nil
	end
	return mw.ustring.char(codepoint)
end

return export