-- Module:Unicode data
-- Revision as of 01:45, 23 June 2018 by imported>Erutuon (copied from wikt:Module:Unicode data)
-- Documentation for this module may be created at Module:Unicode data/doc
local export = {}

local floor = math.floor
-- Scribunto runs Lua 5.1, where unpack is a global; fall back to table.unpack
-- so the module also loads on newer interpreters.
local unpack = unpack or table.unpack

-- http://www.unicode.org/Public/UNIDATA/Jamo.txt
-- For the algorithm used here, see Hangul Syllable Name Generation
-- in section 3.12 of the Unicode Specification.
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
-- The three tables below start at index 0 because they are indexed with the
-- jamo indices of that algorithm.
local hangul_leads = {
	[0] = "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
	"S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
}

local hangul_vowels = {
	[0] = "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
	"WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
}

local hangul_trails = {
	[0] = "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB",
	"LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K",
	"T", "P", "H"
}

-- Builds the name of a precomposed Hangul syllable (U+AC00..U+D7A3) from the
-- short names of its lead consonant, vowel and optional trailing consonant.
local function hangul_syllable_name(codepoint)
	local m_hangul = require('Module:ko-hangul')
	-- lead index, vowel index, trail index
	local li, vi, ti = m_hangul.syllableIndex2JamoIndices(codepoint - 0xAC00)
	return ("HANGUL SYLLABLE %s%s%s"):format(
		hangul_leads[li],
		hangul_vowels[vi],
		hangul_trails[ti]
	)
end

-- Codepoint ranges whose names or labels are generated instead of being stored
-- in the names data modules. Each entry is { first, last, generator }, where
-- the generator is a format string applied to the codepoint or a function that
-- receives the codepoint.
-- Entries must stay sorted by first codepoint and must not overlap:
-- export.lookup_name stops scanning at the first entry that starts above the
-- codepoint being looked up.
local name_hooks = {
	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters
	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{   0x3400,   0x4DB5, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{   0x4E00,   0x9FEF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph --change v10
	{   0xAC00,   0xD7A3, hangul_syllable_name }, -- Hangul Syllables
	{   0xD800,   0xDB7F, "<surrogate-%04X>" }, -- Non Private Use High Surrogate
	{   0xDB80,   0xDBFF, "<surrogate-%04X>" }, -- Private Use High Surrogate
	{   0xDC00,   0xDFFF, "<surrogate-%04X>" }, -- Low Surrogate
	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use
	{  0x17000,  0x187F1, "TANGUT IDEOGRAPH-%05X" }, -- Tangut
	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%05X" }, -- Nushu --add v10
	{  0x20000,  0x2A6D6, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension B
	{  0x2A700,  0x2B734, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension C
	-- Extension D starts at U+2B740 (same start as its entry in the blocks
	-- table). A start of 0x2A740 would overlap Extension C and give ideograph
	-- names to the reserved codepoints U+2B735..U+2B73F.
	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension D
	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension E
	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension F --add v10
	{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%05X" }, -- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{  0xF0000,  0xFFFFD, "<private-use-%05X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%06X>" }  -- Plane 16 Private Use
}

-- The name_hooks entry that matched most recently; checked first on the next
-- lookup, since consecutive lookups often fall in the same range.
local name_range_cache

-- Applies a name_hooks generator (format string or function) to a codepoint.
local function generate_name(data, codepoint)
	if type(data) == "string" then
		return data:format(codepoint)
	else
		return data(codepoint)
	end
end

-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
-- Returns the name of a codepoint, or a label in angle brackets
-- (<control-XXXX>, <noncharacter-XXXX>, <reserved-XXXX>, ...) for codepoints
-- that have no name.
-- codepoint: number.
-- Names that are not generated by name_hooks are read from the data module
-- "Module:Unicode data/names/XXX", where XXX is the codepoint divided by
-- 0x1000, in hexadecimal.
function export.lookup_name(codepoint)
	-- U+FDD0-U+FDEF and all codepoints ending in FFFE or FFFF are noncharacters:
	-- https://www.unicode.org/faq/private_use.html#nonchar4
	if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
			or floor(codepoint % 0x10000) >= 0xFFFE) then
		return ("<noncharacter-%04X>"):format(codepoint)
	end

	if name_range_cache
			and codepoint >= name_range_cache[1]
			and codepoint <= name_range_cache[2] then
		return generate_name(name_range_cache[3], codepoint)
	end

	for _, item in ipairs(name_hooks) do
		if codepoint < item[1] then
			-- name_hooks is sorted, so no later entry can match.
			break
		elseif codepoint <= item[2] then
			name_range_cache = item
			return generate_name(item[3], codepoint)
		end
	end

	-- floor() keeps the argument of %X an integer; not every Lua version
	-- truncates a fractional number silently.
	local success, data = pcall(mw.loadData,
		('Module:Unicode data/names/%03X'):format(floor(codepoint / 0x1000)))

	if success and data[codepoint] then
		return data[codepoint]

	-- Unassigned (Cn) includes noncharacters and reserved characters.
	-- Codepoint has already been determined not to be a noncharacter,
	-- so if it is unassigned (Cn), it is reserved.
	elseif not export.is_assigned(codepoint) then
		return ("<reserved-%04X>"):format(codepoint)

	else
		-- This point should not be reached.
		require("Module:debug").track("Unicode data/no name or label")
		return ("<U-%04X>"):format(codepoint)
	end
end

-- Returns the entry for a codepoint in the data module
-- "Module:Unicode data/images/XXX" (XXX as in lookup_name), or nothing if the
-- data module cannot be loaded.
function export.lookup_image(codepoint)
	local success, data = pcall(mw.loadData,
		('Module:Unicode data/images/%03X'):format(floor(codepoint / 0x1000)))

	if success then
		return data[codepoint]
	end
end

-- Template entry point: looks up the name of the codepoint given as the first
-- argument of the frame (or of its parent frame) and escapes "<" so that
-- labels such as <control-0000> are not read as markup.
-- Raises an error if the argument is not a number.
function export.template_lookup_name(frame)
	local arg = frame.args[1] or frame:getParent().args[1]
	local codepoint = tonumber(arg)
	if not codepoint then
		error("Parameter 1 should be a number (a codepoint), but it is "
			.. tostring(arg) .. ".")
	end

	local name = export.lookup_name(codepoint)
	-- gsub also returns the number of replacements; the parentheses drop it
	-- so that only the escaped name is returned.
	return (name:gsub("<", "&lt;"))
end

-- Plane names, keyed by plane number (codepoint divided by 0x10000).
-- Planes 14, 15 and 16 start at U+E0000, U+F0000 and U+100000, matching the
-- "Tags" and "Supplementary Private Use Area" entries of the blocks table.
local planes = {
	[ 0] = "Basic Multilingual Plane";
	[ 1] = "Supplementary Multilingual Plane";
	[ 2] = "Supplementary Ideographic Plane";
	[14] = "Supplementary Special-purpose Plane";
	[15] = "Supplementary Private Use Area-A";
	[16] = "Supplementary Private Use Area-B";
}

-- http://www.unicode.org/Public/UNIDATA/Blocks.txt
-- This should be kept synchronized with [[Module:category tree/scriptcatboiler/blocks]].
-- Each entry is { block name, first codepoint, last codepoint }. The entries
-- must stay sorted by codepoint: export.lookup_block does a binary search.
local blocks = {
	{ "Basic Latin", 0x000000, 0x00007F },
	{ "Latin-1 Supplement", 0x000080, 0x0000FF },
	{ "Latin Extended-A", 0x000100, 0x00017F },
	{ "Latin Extended-B", 0x000180, 0x00024F },
	{ "IPA Extensions", 0x000250, 0x0002AF },
	{ "Spacing Modifier Letters", 0x0002B0, 0x0002FF },
	{ "Combining Diacritical Marks", 0x000300, 0x00036F },
	{ "Greek and Coptic", 0x000370, 0x0003FF },
	{ "Cyrillic", 0x000400, 0x0004FF },
	{ "Cyrillic Supplement", 0x000500, 0x00052F },
	{ "Armenian", 0x000530, 0x00058F },
	{ "Hebrew", 0x000590, 0x0005FF },
	{ "Arabic", 0x000600, 0x0006FF },
	{ "Syriac", 0x000700, 0x00074F },
	{ "Arabic Supplement", 0x000750, 0x00077F },
	{ "Thaana", 0x000780, 0x0007BF },
	{ "NKo", 0x0007C0, 0x0007FF },
	{ "Samaritan", 0x000800, 0x00083F },
	{ "Mandaic", 0x000840, 0x00085F },
	{ "Syriac Supplement", 0x000860, 0x00086F },
	{ "Arabic Extended-A", 0x0008A0, 0x0008FF },
	{ "Devanagari", 0x000900, 0x00097F },
	{ "Bengali", 0x000980, 0x0009FF },
	{ "Gurmukhi", 0x000A00, 0x000A7F },
	{ "Gujarati", 0x000A80, 0x000AFF },
	{ "Oriya", 0x000B00, 0x000B7F },
	{ "Tamil", 0x000B80, 0x000BFF },
	{ "Telugu", 0x000C00, 0x000C7F },
	{ "Kannada", 0x000C80, 0x000CFF },
	{ "Malayalam", 0x000D00, 0x000D7F },
	{ "Sinhala", 0x000D80, 0x000DFF },
	{ "Thai", 0x000E00, 0x000E7F },
	{ "Lao", 0x000E80, 0x000EFF },
	{ "Tibetan", 0x000F00, 0x000FFF },
	{ "Myanmar", 0x001000, 0x00109F },
	{ "Georgian", 0x0010A0, 0x0010FF },
	{ "Hangul Jamo", 0x001100, 0x0011FF },
	{ "Ethiopic", 0x001200, 0x00137F },
	{ "Ethiopic Supplement", 0x001380, 0x00139F },
	{ "Cherokee", 0x0013A0, 0x0013FF },
	{ "Unified Canadian Aboriginal Syllabics", 0x001400, 0x00167F },
	{ "Ogham", 0x001680, 0x00169F },
	{ "Runic", 0x0016A0, 0x0016FF },
	{ "Tagalog", 0x001700, 0x00171F },
	{ "Hanunoo", 0x001720, 0x00173F },
	{ "Buhid", 0x001740, 0x00175F },
	{ "Tagbanwa", 0x001760, 0x00177F },
	{ "Khmer", 0x001780, 0x0017FF },
	{ "Mongolian", 0x001800, 0x0018AF },
	{ "Unified Canadian Aboriginal Syllabics Extended", 0x0018B0, 0x0018FF },
	{ "Limbu", 0x001900, 0x00194F },
	{ "Tai Le", 0x001950, 0x00197F },
	{ "New Tai Lue", 0x001980, 0x0019DF },
	{ "Khmer Symbols", 0x0019E0, 0x0019FF },
	{ "Buginese", 0x001A00, 0x001A1F },
	{ "Tai Tham", 0x001A20, 0x001AAF },
	{ "Combining Diacritical Marks Extended", 0x001AB0, 0x001AFF },
	{ "Balinese", 0x001B00, 0x001B7F },
	{ "Sundanese", 0x001B80, 0x001BBF },
	{ "Batak", 0x001BC0, 0x001BFF },
	{ "Lepcha", 0x001C00, 0x001C4F },
	{ "Ol Chiki", 0x001C50, 0x001C7F },
	{ "Cyrillic Extended-C", 0x001C80, 0x001C8F },
	{ "Georgian Extended", 0x001C90, 0x001CBF },
	{ "Sundanese Supplement", 0x001CC0, 0x001CCF },
	{ "Vedic Extensions", 0x001CD0, 0x001CFF },
	{ "Phonetic Extensions", 0x001D00, 0x001D7F },
	{ "Phonetic Extensions Supplement", 0x001D80, 0x001DBF },
	{ "Combining Diacritical Marks Supplement", 0x001DC0, 0x001DFF },
	{ "Latin Extended Additional", 0x001E00, 0x001EFF },
	{ "Greek Extended", 0x001F00, 0x001FFF },
	{ "General Punctuation", 0x002000, 0x00206F },
	{ "Superscripts and Subscripts", 0x002070, 0x00209F },
	{ "Currency Symbols", 0x0020A0, 0x0020CF },
	{ "Combining Diacritical Marks for Symbols", 0x0020D0, 0x0020FF },
	{ "Letterlike Symbols", 0x002100, 0x00214F },
	{ "Number Forms", 0x002150, 0x00218F },
	{ "Arrows", 0x002190, 0x0021FF },
	{ "Mathematical Operators", 0x002200, 0x0022FF },
	{ "Miscellaneous Technical", 0x002300, 0x0023FF },
	{ "Control Pictures", 0x002400, 0x00243F },
	{ "Optical Character Recognition", 0x002440, 0x00245F },
	{ "Enclosed Alphanumerics", 0x002460, 0x0024FF },
	{ "Box Drawing", 0x002500, 0x00257F },
	{ "Block Elements", 0x002580, 0x00259F },
	{ "Geometric Shapes", 0x0025A0, 0x0025FF },
	{ "Miscellaneous Symbols", 0x002600, 0x0026FF },
	{ "Dingbats", 0x002700, 0x0027BF },
	{ "Miscellaneous Mathematical Symbols-A", 0x0027C0, 0x0027EF },
	{ "Supplemental Arrows-A", 0x0027F0, 0x0027FF },
	{ "Braille Patterns", 0x002800, 0x0028FF },
	{ "Supplemental Arrows-B", 0x002900, 0x00297F },
	{ "Miscellaneous Mathematical Symbols-B", 0x002980, 0x0029FF },
	{ "Supplemental Mathematical Operators", 0x002A00, 0x002AFF },
	{ "Miscellaneous Symbols and Arrows", 0x002B00, 0x002BFF },
	{ "Glagolitic", 0x002C00, 0x002C5F },
	{ "Latin Extended-C", 0x002C60, 0x002C7F },
	{ "Coptic", 0x002C80, 0x002CFF },
	{ "Georgian Supplement", 0x002D00, 0x002D2F },
	{ "Tifinagh", 0x002D30, 0x002D7F },
	{ "Ethiopic Extended", 0x002D80, 0x002DDF },
	{ "Cyrillic Extended-A", 0x002DE0, 0x002DFF },
	{ "Supplemental Punctuation", 0x002E00, 0x002E7F },
	{ "CJK Radicals Supplement", 0x002E80, 0x002EFF },
	{ "Kangxi Radicals", 0x002F00, 0x002FDF },
	{ "Ideographic Description Characters", 0x002FF0, 0x002FFF },
	{ "CJK Symbols and Punctuation", 0x003000, 0x00303F },
	{ "Hiragana", 0x003040, 0x00309F },
	{ "Katakana", 0x0030A0, 0x0030FF },
	{ "Bopomofo", 0x003100, 0x00312F },
	{ "Hangul Compatibility Jamo", 0x003130, 0x00318F },
	{ "Kanbun", 0x003190, 0x00319F },
	{ "Bopomofo Extended", 0x0031A0, 0x0031BF },
	{ "CJK Strokes", 0x0031C0, 0x0031EF },
	{ "Katakana Phonetic Extensions", 0x0031F0, 0x0031FF },
	{ "Enclosed CJK Letters and Months", 0x003200, 0x0032FF },
	{ "CJK Compatibility", 0x003300, 0x0033FF },
	{ "CJK Unified Ideographs Extension A", 0x003400, 0x004DBF },
	{ "Yijing Hexagram Symbols", 0x004DC0, 0x004DFF },
	{ "CJK Unified Ideographs", 0x004E00, 0x009FFF },
	{ "Yi Syllables", 0x00A000, 0x00A48F },
	{ "Yi Radicals", 0x00A490, 0x00A4CF },
	{ "Lisu", 0x00A4D0, 0x00A4FF },
	{ "Vai", 0x00A500, 0x00A63F },
	{ "Cyrillic Extended-B", 0x00A640, 0x00A69F },
	{ "Bamum", 0x00A6A0, 0x00A6FF },
	{ "Modifier Tone Letters", 0x00A700, 0x00A71F },
	{ "Latin Extended-D", 0x00A720, 0x00A7FF },
	{ "Syloti Nagri", 0x00A800, 0x00A82F },
	{ "Common Indic Number Forms", 0x00A830, 0x00A83F },
	{ "Phags-pa", 0x00A840, 0x00A87F },
	{ "Saurashtra", 0x00A880, 0x00A8DF },
	{ "Devanagari Extended", 0x00A8E0, 0x00A8FF },
	{ "Kayah Li", 0x00A900, 0x00A92F },
	{ "Rejang", 0x00A930, 0x00A95F },
	{ "Hangul Jamo Extended-A", 0x00A960, 0x00A97F },
	{ "Javanese", 0x00A980, 0x00A9DF },
	{ "Myanmar Extended-B", 0x00A9E0, 0x00A9FF },
	{ "Cham", 0x00AA00, 0x00AA5F },
	{ "Myanmar Extended-A", 0x00AA60, 0x00AA7F },
	{ "Tai Viet", 0x00AA80, 0x00AADF },
	{ "Meetei Mayek Extensions", 0x00AAE0, 0x00AAFF },
	{ "Ethiopic Extended-A", 0x00AB00, 0x00AB2F },
	{ "Latin Extended-E", 0x00AB30, 0x00AB6F },
	{ "Cherokee Supplement", 0x00AB70, 0x00ABBF },
	{ "Meetei Mayek", 0x00ABC0, 0x00ABFF },
	{ "Hangul Syllables", 0x00AC00, 0x00D7AF },
	{ "Hangul Jamo Extended-B", 0x00D7B0, 0x00D7FF },
	{ "High Surrogates", 0x00D800, 0x00DB7F },
	{ "High Private Use Surrogates", 0x00DB80, 0x00DBFF },
	{ "Low Surrogates", 0x00DC00, 0x00DFFF },
	{ "Private Use Area", 0x00E000, 0x00F8FF },
	{ "CJK Compatibility Ideographs", 0x00F900, 0x00FAFF },
	{ "Alphabetic Presentation Forms", 0x00FB00, 0x00FB4F },
	{ "Arabic Presentation Forms-A", 0x00FB50, 0x00FDFF },
	{ "Variation Selectors", 0x00FE00, 0x00FE0F },
	{ "Vertical Forms", 0x00FE10, 0x00FE1F },
	{ "Combining Half Marks", 0x00FE20, 0x00FE2F },
	{ "CJK Compatibility Forms", 0x00FE30, 0x00FE4F },
	{ "Small Form Variants", 0x00FE50, 0x00FE6F },
	{ "Arabic Presentation Forms-B", 0x00FE70, 0x00FEFF },
	{ "Halfwidth and Fullwidth Forms", 0x00FF00, 0x00FFEF },
	{ "Specials", 0x00FFF0, 0x00FFFF },
	{ "Linear B Syllabary", 0x010000, 0x01007F },
	{ "Linear B Ideograms", 0x010080, 0x0100FF },
	{ "Aegean Numbers", 0x010100, 0x01013F },
	{ "Ancient Greek Numbers", 0x010140, 0x01018F },
	{ "Ancient Symbols", 0x010190, 0x0101CF },
	{ "Phaistos Disc", 0x0101D0, 0x0101FF },
	{ "Lycian", 0x010280, 0x01029F },
	{ "Carian", 0x0102A0, 0x0102DF },
	{ "Coptic Epact Numbers", 0x0102E0, 0x0102FF },
	{ "Old Italic", 0x010300, 0x01032F },
	{ "Gothic", 0x010330, 0x01034F },
	{ "Old Permic", 0x010350, 0x01037F },
	{ "Ugaritic", 0x010380, 0x01039F },
	{ "Old Persian", 0x0103A0, 0x0103DF },
	{ "Deseret", 0x010400, 0x01044F },
	{ "Shavian", 0x010450, 0x01047F },
	{ "Osmanya", 0x010480, 0x0104AF },
	{ "Osage", 0x0104B0, 0x0104FF },
	{ "Elbasan", 0x010500, 0x01052F },
	{ "Caucasian Albanian", 0x010530, 0x01056F },
	{ "Linear A", 0x010600, 0x01077F },
	{ "Cypriot Syllabary", 0x010800, 0x01083F },
	{ "Imperial Aramaic", 0x010840, 0x01085F },
	{ "Palmyrene", 0x010860, 0x01087F },
	{ "Nabataean", 0x010880, 0x0108AF },
	{ "Hatran", 0x0108E0, 0x0108FF },
	{ "Phoenician", 0x010900, 0x01091F },
	{ "Lydian", 0x010920, 0x01093F },
	{ "Meroitic Hieroglyphs", 0x010980, 0x01099F },
	{ "Meroitic Cursive", 0x0109A0, 0x0109FF },
	{ "Kharoshthi", 0x010A00, 0x010A5F },
	{ "Old South Arabian", 0x010A60, 0x010A7F },
	{ "Old North Arabian", 0x010A80, 0x010A9F },
	{ "Manichaean", 0x010AC0, 0x010AFF },
	{ "Avestan", 0x010B00, 0x010B3F },
	{ "Inscriptional Parthian", 0x010B40, 0x010B5F },
	{ "Inscriptional Pahlavi", 0x010B60, 0x010B7F },
	{ "Psalter Pahlavi", 0x010B80, 0x010BAF },
	{ "Old Turkic", 0x010C00, 0x010C4F },
	{ "Old Hungarian", 0x010C80, 0x010CFF },
	{ "Hanifi Rohingya", 0x010D00, 0x010D3F },
	{ "Rumi Numeral Symbols", 0x010E60, 0x010E7F },
	{ "Old Sogdian", 0x010F00, 0x010F2F },
	{ "Sogdian", 0x010F30, 0x010F6F },
	{ "Brahmi", 0x011000, 0x01107F },
	{ "Kaithi", 0x011080, 0x0110CF },
	{ "Sora Sompeng", 0x0110D0, 0x0110FF },
	{ "Chakma", 0x011100, 0x01114F },
	{ "Mahajani", 0x011150, 0x01117F },
	{ "Sharada", 0x011180, 0x0111DF },
	{ "Sinhala Archaic Numbers", 0x0111E0, 0x0111FF },
	{ "Khojki", 0x011200, 0x01124F },
	{ "Multani", 0x011280, 0x0112AF },
	{ "Khudawadi", 0x0112B0, 0x0112FF },
	{ "Grantha", 0x011300, 0x01137F },
	{ "Newa", 0x011400, 0x01147F },
	{ "Tirhuta", 0x011480, 0x0114DF },
	{ "Siddham", 0x011580, 0x0115FF },
	{ "Modi", 0x011600, 0x01165F },
	{ "Mongolian Supplement", 0x011660, 0x01167F },
	{ "Takri", 0x011680, 0x0116CF },
	{ "Ahom", 0x011700, 0x01173F },
	{ "Dogra", 0x011800, 0x01184F },
	{ "Warang Citi", 0x0118A0, 0x0118FF },
	{ "Zanabazar Square", 0x011A00, 0x011A4F },
	{ "Soyombo", 0x011A50, 0x011AAF },
	{ "Pau Cin Hau", 0x011AC0, 0x011AFF },
	{ "Bhaiksuki", 0x011C00, 0x011C6F },
	{ "Marchen", 0x011C70, 0x011CBF },
	{ "Masaram Gondi", 0x011D00, 0x011D5F },
	{ "Gunjala Gondi", 0x011D60, 0x011DAF },
	{ "Makasar", 0x011EE0, 0x011EFF },
	{ "Cuneiform", 0x012000, 0x0123FF },
	{ "Cuneiform Numbers and Punctuation", 0x012400, 0x01247F },
	{ "Early Dynastic Cuneiform", 0x012480, 0x01254F },
	{ "Egyptian Hieroglyphs", 0x013000, 0x01342F },
	{ "Anatolian Hieroglyphs", 0x014400, 0x01467F },
	{ "Bamum Supplement", 0x016800, 0x016A3F },
	{ "Mro", 0x016A40, 0x016A6F },
	{ "Bassa Vah", 0x016AD0, 0x016AFF },
	{ "Pahawh Hmong", 0x016B00, 0x016B8F },
	{ "Medefaidrin", 0x016E40, 0x016E9F },
	{ "Miao", 0x016F00, 0x016F9F },
	{ "Ideographic Symbols and Punctuation", 0x016FE0, 0x016FFF },
	{ "Tangut", 0x017000, 0x0187FF },
	{ "Tangut Components", 0x018800, 0x018AFF },
	{ "Kana Supplement", 0x01B000, 0x01B0FF },
	{ "Kana Extended-A", 0x01B100, 0x01B12F },
	{ "Nushu", 0x01B170, 0x01B2FF },
	{ "Duployan", 0x01BC00, 0x01BC9F },
	{ "Shorthand Format Controls", 0x01BCA0, 0x01BCAF },
	{ "Byzantine Musical Symbols", 0x01D000, 0x01D0FF },
	{ "Musical Symbols", 0x01D100, 0x01D1FF },
	{ "Ancient Greek Musical Notation", 0x01D200, 0x01D24F },
	{ "Mayan Numerals", 0x01D2E0, 0x01D2FF },
	{ "Tai Xuan Jing Symbols", 0x01D300, 0x01D35F },
	{ "Counting Rod Numerals", 0x01D360, 0x01D37F },
	{ "Mathematical Alphanumeric Symbols", 0x01D400, 0x01D7FF },
	{ "Sutton SignWriting", 0x01D800, 0x01DAAF },
	{ "Glagolitic Supplement", 0x01E000, 0x01E02F },
	{ "Mende Kikakui", 0x01E800, 0x01E8DF },
	{ "Adlam", 0x01E900, 0x01E95F },
	{ "Indic Siyaq Numbers", 0x01EC70, 0x01ECBF },
	{ "Arabic Mathematical Alphabetic Symbols", 0x01EE00, 0x01EEFF },
	{ "Mahjong Tiles", 0x01F000, 0x01F02F },
	{ "Domino Tiles", 0x01F030, 0x01F09F },
	{ "Playing Cards", 0x01F0A0, 0x01F0FF },
	{ "Enclosed Alphanumeric Supplement", 0x01F100, 0x01F1FF },
	{ "Enclosed Ideographic Supplement", 0x01F200, 0x01F2FF },
	{ "Miscellaneous Symbols and Pictographs", 0x01F300, 0x01F5FF },
	{ "Emoticons", 0x01F600, 0x01F64F },
	{ "Ornamental Dingbats", 0x01F650, 0x01F67F },
	{ "Transport and Map Symbols", 0x01F680, 0x01F6FF },
	{ "Alchemical Symbols", 0x01F700, 0x01F77F },
	{ "Geometric Shapes Extended", 0x01F780, 0x01F7FF },
	{ "Supplemental Arrows-C", 0x01F800, 0x01F8FF },
	{ "Supplemental Symbols and Pictographs", 0x01F900, 0x01F9FF },
	{ "Chess Symbols", 0x01FA00, 0x01FA6F },
	{ "CJK Unified Ideographs Extension B", 0x020000, 0x02A6DF },
	{ "CJK Unified Ideographs Extension C", 0x02A700, 0x02B73F },
	{ "CJK Unified Ideographs Extension D", 0x02B740, 0x02B81F },
	{ "CJK Unified Ideographs Extension E", 0x02B820, 0x02CEAF },
	{ "CJK Unified Ideographs Extension F", 0x02CEB0, 0x02EBEF },
	{ "CJK Compatibility Ideographs Supplement", 0x02F800, 0x02FA1F },
	{ "Tags", 0x0E0000, 0x0E007F },
	{ "Variation Selectors Supplement", 0x0E0100, 0x0E01EF },
	{ "Supplementary Private Use Area-A", 0x0F0000, 0x0FFFFF },
	{ "Supplementary Private Use Area-B", 0x100000, 0x10FFFF },
}
-- Number of blocks, stored once so that lookups need not recompute it.
blocks.length = #blocks

-- Returns a stateless iterator over the blocks, for use in a generic for:
--   for i, name, first, last in export.enum_blocks() do ... end
function export.enum_blocks()
	return function (block_list, i)
		i = i + 1
		local data = block_list[i]
		if not data then
			return nil
		end
		return i, unpack(data)
	end, blocks, 0
end

-- Returns the name of the plane that contains the codepoint, or "Plane N" for
-- a plane without an entry in the planes table.
function export.lookup_plane(codepoint)
	local i = floor(codepoint / 0x10000)
	return planes[i] or ("Plane %d"):format(i)
end

-- Binary search, to avoid iterating over entire table in order to look up the
-- higher codepoints.
-- Returns the name of the block that contains the codepoint.
-- The blocks table is sorted by codepoint, so the block is found by bisection.
-- Raises an error if the codepoint lies in none of the listed blocks.
function export.lookup_block(codepoint)
	local low, high = 1, blocks.length or #blocks
	while low <= high do
		local mid = floor((low + high) / 2)
		local block = blocks[mid]
		if codepoint < block[2] then
			high = mid - 1
		elseif codepoint <= block[3] then
			return block[1]
		else
			low = mid + 1
		end
	end
	error(string.format("No block found for codepoint U+%04X.", codepoint))
end

-- Returns the first and last codepoint of the block with the given name, or
-- nothing if there is no block with that name.
function export.get_block_range(name)
	for _, block in ipairs(blocks) do
		if block[1] == name then
			return block[2], block[3]
		end
	end
end

-- Returns true if pagename contains only characters that are allowed here in a
-- page title and at least one character that is not a space separator.
-- An empty pagename is therefore not valid.
function export.is_valid_pagename(pagename)
	local has_nonws = false

	for cp in mw.ustring.gcodepoint(pagename) do
		if (cp == 0x0023) -- #
		or (cp == 0x005B) -- [
		or (cp == 0x005D) -- ]
		or (cp == 0x007B) -- {
		or (cp == 0x007C) -- |
		or (cp == 0x007D) -- }
		or (cp == 0x180E) -- MONGOLIAN VOWEL SEPARATOR
		or ((cp >= 0x2000) and (cp <= 0x200A)) -- spaces in General Punctuation block
		or (cp == 0xFFFD) -- REPLACEMENT CHARACTER
		then
			return false
		end

		local printable, result = export.is_printable(cp)

		if not printable then
			return false
		end

		if result ~= "space-separator" then
			has_nonws = true
		end
	end

	return has_nonws
end

-- Returns the items of the sequence `what`, starting at index `from`
-- (default 1), as multiple values. The items are read through ipairs instead
-- of being handed to unpack directly.
-- NOTE(review): presumably needed because unpack does not work on the tables
-- returned by mw.loadData -- confirm.
local function manual_unpack(what, from)
	local result = {}
	from = from or 1
	for i, item in ipairs(what) do
		if i >= from then
			table.insert(result, item)
		end
	end
	return unpack(result)
end

-- Builds a lookup function over a data set that consists of a table of single
-- codepoints and a list of codepoint ranges.
--   loader:     function returning singles, ranges; called on the first lookup
--               only. singles maps a codepoint to a value; each range is
--               { first, last, value... }, and the list must be sorted by
--               first codepoint.
--   match_func: function (codepoint, value...) that turns the value found for
--               a codepoint into the result of the lookup.
--   ...:        default value(s) passed to match_func for a codepoint that is
--               in neither singles nor ranges.
-- Ranges that have been hit, and gaps between ranges, are remembered in a
-- cache that is checked before the full range list.
local function memo_lookup(loader, match_func, ...)
	local dots = { ... }
	local cache = {}
	local singles, ranges

	return function (codepoint)
		if not singles then
			singles, ranges = loader()
		end

		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end

		for _, range in ipairs(cache) do
			if (range[1] <= codepoint) and (codepoint <= range[2]) then
				return match_func(codepoint, unpack(range, 3))
			end
		end

		-- End of the previous range; -1 so that a gap before the first range
		-- starts at codepoint 0.
		local lastlast = -1

		-- ipairs, not pairs: the early exit below depends on visiting the
		-- ranges in ascending order, which pairs does not guarantee.
		for _, range in ipairs(ranges) do
			if codepoint < range[1] then
				-- The codepoint is in the gap before this range: remember the
				-- whole gap together with the default values.
				table.insert(cache, { lastlast + 1, range[1] - 1, unpack(dots) })
				return match_func(codepoint, unpack(dots))
			elseif codepoint <= range[2] then
				table.insert(cache, { manual_unpack(range) })
				return match_func(codepoint, manual_unpack(range, 3))
			else
				lastlast = range[2]
			end
		end

		-- Past the last range: use the defaults, as for any other gap.
		return match_func(codepoint, unpack(dots))
	end
end

-- Get a codepoint's combining class value in [[Module:Unicode data/combining]],
-- and return whether this value is not zero. Zero is assigned as the default
-- if the combining class value is not found in this data module.
-- That is, return true if character is combining, or false if it is not.
-- See http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
export.is_combining = memo_lookup(function ()
	local m_comb = mw.loadData('Module:Unicode data/combining')
	return m_comb.single, m_comb.ranges
end, function (codepoint, combining_class)
	return combining_class and combining_class ~= 0 or false
end, 0)

-- Returns str with a dotted circle (U+25CC) inserted before every combining
-- character.
function export.add_dotted_circle(str)
	return (mw.ustring.gsub(str, ".",
		function(char)
			if export.is_combining(mw.ustring.codepoint(char)) then
				return '◌' .. char
			end
		end))
end

-- Looks up the category of a codepoint in [[Module:Unicode data/control]];
-- codepoints that are not listed there count as "assigned".
local lookup_control = memo_lookup(function ()
	local m_cc = mw.loadData('Module:Unicode data/control')
	return m_cc.single, m_cc.ranges
end, function (codepoint, ccc)
	return ccc or "assigned"
end, "assigned")

-- Returns false if the codepoint is listed as "unassigned", true otherwise.
function export.is_assigned(codepoint)
	return lookup_control(codepoint) ~= "unassigned"
end

-- Returns whether the codepoint is "assigned" or a "space-separator", followed
-- by the category string that was found for it.
function export.is_printable(codepoint)
	local result = lookup_control(codepoint)
	return (result == "assigned") or (result == "space-separator"), result
end

-- Returns whether the codepoint is a "space-separator", followed by the
-- category string that was found for it.
function export.is_whitespace(codepoint)
	local result = lookup_control(codepoint)
	return (result == "space-separator"), result
end

-- to be used in language-neutral context only (e.g. character lists)
-- Maps script code to a character-class pattern; filled in on first use by
-- export.get_script.
local script_pats

-- Scripts that consist entirely of characters from another script.
local script_blacklist = {
	["Latf"] = true;
	["Hans"] = true;
	["Hant"] = true;
	["Kore"] = true;
	["Jpan"] = true;
	["fa-Arab"] = true;
	["kk-Arab"] = true;
	["ks-Arab"] = true;
	["ku-Arab"] = true;
	["mzn-Arab"] = true;
	["ota-Arab"] = true;
	["pa-Arab"] = true;
	["ps-Arab"] = true;
	["sd-Arab"] = true;
	["tt-Arab"] = true;
	["ug-Arab"] = true;
	["ur-Arab"] = true;
	["nv-Latn"] = true;
	["pjt-Latn"] = true;
	["Zyyy"] = true;
}

--[[
	Problem scripts: Grek and polytonic, Cyrl and Cyrs, Latn and Latinx.
	In each key-value pair, the value should take precedence over the key.
]]
local overridden_by = {
	["Cyrs"] = "Cyrl",
	["polytonic"] = "Grek",
	["Latinx"] = "Latn",
}

-- Patterns that have already matched, mapped to their script code.
local script_cache = {}

-- Returns the code of a script whose character pattern in
-- [[Module:scripts/data]] matches the given codepoint (number) or text
-- (string), or "None" if no pattern matches. Scripts in script_blacklist are
-- ignored, and a script listed in overridden_by gives way to its replacement
-- when that one matches as well.
-- Raises an error if the argument is neither a number nor a string.
function export.get_script(codepoint)
	local text
	if type(codepoint) == "number" then
		text = mw.ustring.char(codepoint)
	elseif type(codepoint) == "string" then
		text = codepoint
	else
		error("Argument to get_script should be a number (codepoint) or string.")
	end

	-- Try the patterns that have matched before.
	for pat, sc in pairs(script_cache) do
		if mw.ustring.match(text, pat) and not overridden_by[sc] then
			return sc
		end
	end

	-- Build the pattern table on first use.
	if not script_pats then
		local m_scripts = mw.loadData("Module:scripts/data")
		script_pats = {}
		for sc, info in pairs(m_scripts) do
			if info.characters and not script_blacklist[sc] then
				script_pats[sc] = "[" .. info.characters .. "]"
			end
		end
	end

	for sc, pat in pairs(script_pats) do
		if mw.ustring.match(text, pat) then
			local overriding = overridden_by[sc]
			if overriding and script_pats[overriding]
					and mw.ustring.match(text, script_pats[overriding]) then
				script_cache[script_pats[overriding]] = overriding
				return overriding
			else
				script_cache[pat] = sc
				return sc
			end
		end
	end

	return "None"
end

-- Orders ranges by their first codepoint.
local function sortRange(range1, range2)
	return range1[1] < range2[1]
end

--[[
	Binary search: more efficient for the longer lists of codepoint ranges
	than for the shorter ones.
	Returns the range { first, last, ... } in the sorted list `ranges` that
	contains `value`, or nil if there is none (or if `ranges` is nil).
]]
local function binary_search(ranges, value)
	if not ranges then
		return nil
	end

	local iStart = 1
	-- Can't use # because table is loaded by mw.loadData.
	local iEnd = ranges.length or require("Module:table").size(ranges)

	while iStart <= iEnd do
		local iMid = floor((iStart + iEnd) / 2)
		local range = ranges[iMid]

		if range[1] > value then
			iEnd = iMid - 1
		-- Assumes the ranges do not overlap.
		elseif value <= range[2] then
			return range
		else
			iStart = iMid + 1
		end
	end

	return nil
end

-- Linear search in a list of { first, last, value } ranges sorted by first
-- value. Returns the value of the range containing `number`, or nil.
local function look_up_in_order(number, ranges)
	for _, range in ipairs(ranges) do
		if number < range[1] then
			-- Sorted list: no later range can contain the number.
			return nil
		elseif number <= range[2] then
			return range[3]
		end
	end
end

-- Save previously used codepoint ranges in case another character is in the
-- same range.
local ranges_cache = {}

--[=[
	Takes a codepoint or a character and finds the script code (if any) that is
	appropriate for it based on the codepoint, using the data module
	[[Module:Unicode data/scripts]]. The data module was generated from the
	patterns in [[Module:scripts/data]] using [[Module:User:Erutuon/script recognition]].

	Converts the character to a codepoint. Returns a script code if the codepoint
	is in the list of individual characters, or if it is in one of the defined
	ranges in the 4096-character block that it belongs to, else returns "None".

	Raises an error if the argument is a string that is not exactly one
	character long, or is neither a string nor a number.
]=]
function export.char_to_script(char)
	local lookup = mw.loadData("Module:Unicode data/scripts")
	local t = type(char)
	local codepoint
	if t == "string" then
		local etc
		-- Ask for the first two codepoints: with the default arguments only
		-- the first one is returned and a longer string would go unnoticed.
		codepoint, etc = mw.ustring.codepoint(char, 1, 2)
		if etc or not codepoint then
			error("Argument to char_to_script should be a single character.")
		end
	elseif t == "number" then
		codepoint = char
	else
		error("Argument to char_to_script should be a string or a number, but its type is " .. t .. ".")
	end

	local individual_match = lookup.individual[codepoint]
	if individual_match then
		return individual_match
	end

	-- Ranges that were found by earlier calls.
	local script = look_up_in_order(codepoint, ranges_cache)
	if script then
		return script
	end

	-- Runs of 4096-codepoint blocks that belong to a single script.
	local index = floor(codepoint / 0x1000)
	script = look_up_in_order(index, lookup.blocks)
	if script then
		return script
	end

	local range = binary_search(lookup[index], codepoint)
	if range then
		-- Keep the cache sorted, as look_up_in_order requires.
		table.insert(ranges_cache, range)
		table.sort(ranges_cache, sortRange)
		return range[3]
	end

	return "None"
end

-- Returns the script code that export.char_to_script reports for the largest
-- number of characters in the UTF-8 string `text`, or nil if text contains no
-- characters. Which script wins a tie is unspecified, because the counts are
-- traversed with pairs.
function export.find_best_script(text)
	local scripts = {}
	-- The pattern matches one UTF-8 encoded character at a time.
	for character in text:gmatch("[%z\1-\127\194-\244][\128-\191]*") do
		local script = export.char_to_script(character)
		scripts[script] = (scripts[script] or 0) + 1
	end

	local best_script
	local greatest_count = 0
	for script, count in pairs(scripts) do
		if count > greatest_count then
			best_script = script
			greatest_count = count
		end
	end

	return best_script
end

-- Titles of the entries for characters that cannot be used as a page title on
-- their own.
local unsupported_title = {
	[0x0020] = "Unsupported titles/Space";
	[0x0023] = "Unsupported titles/Number sign";
	[0x002E] = "Unsupported titles/Full stop";
	[0x003A] = "Unsupported titles/Colon";
	[0x003C] = "Unsupported titles/Less than";
	[0x003E] = "Unsupported titles/Greater than";
	[0x005B] = "Unsupported titles/Left square bracket";
	[0x005D] = "Unsupported titles/Right square bracket";
	[0x005F] = "Unsupported titles/Low line";
	[0x007B] = "Unsupported titles/Left curly bracket";
	[0x007C] = "Unsupported titles/Vertical line";
	[0x007D] = "Unsupported titles/Right curly bracket";
	[0x1680] = "Unsupported titles/Ogham space";
	[0xFFFD] = "Unsupported titles/Replacement character";
}

-- Returns the title of the entry for a codepoint: the title listed in
-- unsupported_title if there is one, otherwise the character itself, or nil
-- if the codepoint is not categorized as "assigned".
function export.get_entry_title(codepoint)
	if unsupported_title[codepoint] then
		return unsupported_title[codepoint]
	end
	if lookup_control(codepoint) ~= "assigned" then
		return nil
	end
	return mw.ustring.char(codepoint)
end

return export