-- Module:Language/scripts
--
-- From TWC Archive
-- Jump to navigation Jump to search

-- Export table: the public functions of this module are attached to it and it
-- is returned at the end of the file.
local p = {}

-- Local aliases for the library functions used by this module.
local gsub = mw.ustring.gsub
local length = mw.ustring.len
local floor = math.floor
-- Lua pattern made of a lead-byte class followed by any number of bytes in the
-- 128-191 range; presumably meant to match one UTF-8 encoded character.
-- NOTE(review): not referenced anywhere in the visible code.
local UTF8Char = "[%z\1-\127\194-\244][\128-\191]*"

local codepoint_data = mw.loadData("Module:language/scripts/codepoints")

local data = require("Module:Language/scripts/data")

--[[
Template entry point: returns the character set stored for a script code.

frame.args[1] is the script code, used as a key into the script data table.

Returns the "characters" field of the matching entry. When the code is missing
or has no entry in the data table, returns "Please supply a valid script
code."; when the entry exists but has no "characters" field, returns a
"No characters found for <code>." message.
]]
function p.print(frame)
	local scriptCode = frame.args[1]
	local scriptData = scriptCode and data[scriptCode]

	-- Return the message directly instead of storing it in scriptData: a
	-- string stored there would be indexed below like a data entry, so the
	-- message could never be shown, and a missing code would fail on the
	-- concatenation with nil.
	if not scriptData then
		return "Please supply a valid script code."
	end

	return scriptData.characters or "No characters found for " .. scriptCode .. "."
end

-- NOTE(review): this table appears unused in the visible code; every later use
-- of the name `script` is a function-local variable or parameter that shadows it.
local script = {}

-- Based on the Script:countCharacters() function of Module:scripts on Wiktionary
--
-- Counts the characters of `text` that match the character class built from
-- the "characters" field of the data entry for `scriptCode`.
-- Returns 0 when the script code has no data entry or the entry has no
-- "characters" field.
local function countCharacters(text, scriptCode)
	-- Guard the lookup so an unknown script code yields 0 instead of an
	-- "attempt to index a nil value" error.
	local scriptData = data[scriptCode]
	local characters = scriptData and scriptData["characters"]
	if not characters then
		return 0
	end

	-- gsub's second return value is the number of substitutions made, i.e.
	-- the number of matching characters.
	local _, count = gsub(text, "[" .. characters .. "]", "")
	return count
end

--[[
Reports whether at least a quarter of the characters in `text` are matched by
the "Latn" character set.

Returns true or false for string input (an empty string yields true, because
a count of 0 is not less than 0), and nil for input that is neither a string
nor a number.
]]
function p.isLatn(text)
	-- The previous check, type(tostring(text)) == "string", was always true,
	-- so the nil branch was unreachable and non-string input failed later.
	if type(text) == "number" then
		-- Numbers are converted explicitly so they stay accepted without
		-- relying on the ustring functions to coerce them.
		text = tostring(text)
	elseif type(text) ~= "string" then
		return nil
	end

	local count = countCharacters(text, "Latn")
	if count < (length(text) / 4) then
		-- Fewer than 25% of the characters in the string are Latin.
		return false
	end
	return true
end

-- Template entry point: applies p.isLatn to the first frame argument and
-- returns its result.
function p.Latin(frame)
	return p.isLatn(frame.args[1])
end

-- Script codes that p.getScript leaves out when working out which script a
-- text is written in; used there as a lookup table keyed by script code.
local ignore_script = require("Module:table").listToSet{ "Zinh", "Zmth", "Zsym", "Zsye", "Zxxx", "Zyyy", "Zzzz" }

-- Applies func(value, key, t) to every entry of t and collects the results in
-- a new array. When t[1] is set, t is walked with ipairs and results keep their
-- indices; otherwise it is walked with pairs and results are numbered in
-- visiting order.
local function map(func, t)
	local results = {}

	if not t[1] then
		local n = 0
		for key, value in pairs(t) do
			n = n + 1
			results[n] = func(value, key, t)
		end
		return results
	end

	for index, value in ipairs(t) do
		results[index] = func(value, index, t)
	end
	return results
end

-- Returns a new table holding the entries of t for which func(value, key, t)
-- is truthy. When t[1] is set, the kept values are packed into a new array in
-- order; otherwise the kept values stay under their original keys.
local function filter(t, func)
	local kept = {}

	if not t[1] then
		for key, value in pairs(t) do
			if func(value, key, t) then
				kept[key] = value
			end
		end
		return kept
	end

	for index, value in ipairs(t) do
		if func(value, index, t) then
			kept[#kept + 1] = value
		end
	end
	return kept
end

-- Comparator for table.sort: orders range arrays by their first element.
local function sortRange(range1, range2)
	local start1, start2 = range1[1], range2[1]
	return start1 < start2
end

--[[
Binary search: efficient for long lists of codepoint ranges.

`ranges` is an array of range arrays such as { 0x41, 0x7A, "Latn" }, with its
element count stored in the field `length`. Returns the range array whose
first and second elements enclose `value`, or nil when `ranges` is nil or no
range matches. The whole range array is returned so that it can be cached.
]]
local function binarySearch(ranges, value)
	if not ranges then
		return nil
	end

	local low, high = 1, ranges.length

	-- An empty list (length 0) never enters the loop and falls through to nil.
	while low <= high do
		local mid = floor((low + high) / 2)
		local range = ranges[mid]

		if value < range[1] then
			high = mid - 1
		elseif value <= range[2] then
			return range
		else
			low = mid + 1
		end
	end

	return nil
end

--[[ -- For debugging local function toHex(number) return ("0x%X"):format(number) end

local function logRange(range, number) return mw.log(toHex(range[1]), toHex(number) .. " (" .. mw.ustring.char(number) .. ")", toHex(range[2]), range[3]) end --]]

-- Linear scan over an array of range arrays sorted by their first element.
-- Returns the third element of the first range that encloses `number`; gives
-- up with nil as soon as a range starts above `number`, and returns nothing
-- when the list is exhausted.
local function lookUpInOrder(number, ranges)
	for _, candidate in ipairs(ranges) do
		if number < candidate[1] then
			return nil
		end
		if number <= candidate[2] then
			return candidate[3]
		end
	end
end

-- Save previously used codepoint ranges in case another character is in the
-- same range.
local rangesCache = {}

--[=[
Takes a codepoint and returns the script code that is appropriate for it,
based on the data module Module:Language/scripts/codepoints.

The data module uses the official Unicode script codes.

Lookup order: the codepoint-to-script map (`individual`), then the cache of
ranges matched earlier, then a binary search of the full array of ranges.
A range found by the binary search is added to the cache, which is kept
sorted by range start. Returns "Zzzz" when nothing matches.

Raises an error when the argument is not a number.
]=]
function p.codepointToScript(codepoint)
	local argType = type(codepoint)
	if argType ~= "number" then
		error("Argument to codepointToScript should be a number, but its type is " .. argType .. ".")
	end

	local exact = codepoint_data.individual[codepoint]
	if exact then
		return exact
	end

	local cached = lookUpInOrder(codepoint, rangesCache)
	if cached then
		return cached
	end

	local found = binarySearch(codepoint_data.ranges, codepoint)
	if not found then
		return "Zzzz"
	end

	-- Keep the cache ordered: lookUpInOrder stops at the first range that
	-- starts above the codepoint.
	rangesCache[#rangesCache + 1] = found
	table.sort(rangesCache, sortRange)
	return found[3]
end

-- Returns the script code for `char` by passing its codepoint, as given by
-- mw.ustring.codepoint, to p.codepointToScript.
local function charToScript(char)
	local codepoint = mw.ustring.codepoint(char)
	return p.codepointToScript(codepoint)
end

-- Tallies the script code of every codepoint in `text`.
-- Returns a table mapping each script code to the number of codepoints that
-- p.codepointToScript assigned to it. Raises an error when `text` is not a
-- string.
function p.countScripts(text)
	if type(text) ~= "string" then
		error("countScripts requires a string")
	end

	local toScript = p.codepointToScript
	local tally = {}

	for codepoint in mw.ustring.gcodepoint(text) do
		local code = toScript(codepoint)
		if code then
			tally[code] = (tally[code] or 0) + 1
		end
	end

	return tally
end

-- Returns the single script code found in `text`, leaving out the codes listed
-- in ignore_script. Returns nil when no code remains and raises an error when
-- more than one remains.
function p.getScript(text)
	local found = {}

	-- Collect the codes and drop the ignored ones in the same pass.
	for code in pairs(p.countScripts(text)) do
		if not ignore_script[code] then
			found[#found + 1] = code
		end
	end

	if found[2] then
		error("More than one script was found for " .. text)
	end
	return found[1]
end

-- Template entry point: builds one "* <argument>: <code> (<count>), ..." line
-- per frame argument from the result of p.countScripts, and joins the lines
-- with newlines.
function p.showScripts(frame)
	-- Formats one entry of the counts table, e.g. "Latn (3)".
	local function formatCount(count, code)
		return code .. " (" .. count .. ")"
	end

	-- Formats the line for one argument.
	local function formatArg(arg)
		local counts = map(formatCount, p.countScripts(arg))
		return "* " .. arg .. ": " .. table.concat(counts, ", ")
	end

	return table.concat(map(formatArg, frame.args), "\n")
end

return p