diff options
Diffstat (limited to 'mac/.config/mpv/script-modules/utf8/charclass')
9 files changed, 627 insertions, 0 deletions
-- =============================================================================
-- File: mac/.config/mpv/script-modules/utf8/charclass/compiletime/builder.lua
-- (new file, mode 100644)
-- =============================================================================

--- Compile-time character-class builder.
-- A builder accumulates codes, ranges and named classes while a pattern is
-- being parsed; `build()` renders the accumulated state as Lua source text
-- that constructs the equivalent class (`cl.new():with_codes(...)...`).
-- Loading this module also registers the builder table at
-- `utf8.regex.compiletime.charclass.builder`.
return function(utf8)

local byte = utf8.byte
local unpack = utf8.config.unpack

local builder = {}
local mt = {__index = builder}

utf8.regex.compiletime.charclass.builder = builder

-- Append every vararg (up to the first nil) to `list`.
local function append(list, ...)
  for _, v in ipairs({...}) do
    list[#list + 1] = v
  end
end

-- Render a list of class names as `'a', 'b'`.
-- An absent or empty list renders as the empty string so that no bogus
-- empty class name ('') ends up in the generated source.
local function quoted_list(names)
  if not names or #names == 0 then
    return ''
  end
  return "'" .. table.concat(names, "', '") .. "'"
end

--- Create an empty builder.
function builder.new()
  return setmetatable({}, mt)
end

--- Mark the class as inverted; `build()` then appends `:invert()`.
-- @return self
function builder:invert()
  self.inverted = true
  return self
end

--- Flag the class as being enclosed in [].
-- NOTE(review): this stores `true` under the key `internal` on the instance,
-- which hides this method for that instance afterwards -- confirm intended.
-- @return self
function builder:internal() -- is it enclosed in []
  self.internal = true
  return self
end

--- Add codes; numbers are taken as is, anything else goes through `utf8.byte`.
-- The code list is kept sorted.
-- @return self
function builder:with_codes(...)
  local codes = self.codes or {}
  self.codes = codes

  for _, v in ipairs({...}) do
    codes[#codes + 1] = type(v) == "number" and v or byte(v)
  end

  table.sort(codes)
  return self
end

--- Add ranges, each a `{first, last}` pair.
-- @return self
function builder:with_ranges(...)
  self.ranges = self.ranges or {}
  append(self.ranges, ...)
  return self
end

--- Add named classes (e.g. 'alpha').
-- @return self
function builder:with_classes(...)
  self.classes = self.classes or {}
  append(self.classes, ...)
  return self
end

--- Add negated named classes.
-- @return self
function builder:without_classes(...)
  self.not_classes = self.not_classes or {}
  append(self.not_classes, ...)
  return self
end

--- Merge another builder into this one.
-- A non-inverted builder is merged field by field; an inverted one is kept
-- whole in `self.includes` and later rendered through `with_subs`.
-- @param b builder to include
-- @return self
function builder:include(b)
  if b.inverted then
    self.includes = self.includes or {}
    self.includes[#self.includes + 1] = b
    return self
  end

  if b.codes then
    self:with_codes(unpack(b.codes))
  end
  if b.ranges then
    self:with_ranges(unpack(b.ranges))
  end
  if b.classes then
    self:with_classes(unpack(b.classes))
  end
  if b.not_classes then
    self:without_classes(unpack(b.not_classes))
  end
  return self
end

--- Render the builder as Lua source text.
-- A builder that holds exactly one code and nothing else is rendered as a
-- table literal with an equality `test`; everything else is rendered as a
-- `cl.new():with_codes(...):with_ranges(...)...` call chain.
-- @return string of Lua source
function builder:build()
  local codes, ranges = self.codes, self.ranges
  local classes, not_classes = self.classes, self.not_classes
  local includes = self.includes

  local single_code = codes and #codes == 1 and not self.inverted
    and not ranges and not classes and not not_classes and not includes
  if single_code then
    return "{test = function(self, cc) return cc == " .. codes[1] .. " end}"
  end

  local codes_list = table.concat(codes or {}, ', ')

  local range_parts = {}
  for i, r in ipairs(ranges or {}) do
    range_parts[i] = '{' .. tostring(r[1]) .. ', ' .. tostring(r[2]) .. '}'
  end
  local ranges_list = table.concat(range_parts, ', ')

  local classes_list = quoted_list(classes)
  local not_classes_list = quoted_list(not_classes)

  local sub_parts = {}
  for i, sub in ipairs(includes or {}) do
    sub_parts[i] = sub:build()
  end
  local subs_list = table.concat(sub_parts, ', ')

  local src = [[cl.new():with_codes(
        ]] .. codes_list .. [[
    ):with_ranges(
        ]] .. ranges_list .. [[
    ):with_classes(
        ]] .. classes_list .. [[
    ):without_classes(
        ]] .. not_classes_list .. [[
    ):with_subs(
        ]] .. subs_list .. [[
    )]]

  if self.inverted then
    src = src .. ':invert()'
  end

  return src
end

return builder

end

-- =============================================================================
-- File: mac/.config/mpv/script-modules/utf8/charclass/compiletime/parser.lua
-- (new file, mode 100644)
-- =============================================================================

--- Installs `utf8.regex.compiletime.charclass.parse`.
-- Unless `utf8.config.compiletime_charclasses` is already set, it defaults to
-- the vanilla, range and stub parsers, tried in that order.
return function(utf8)

utf8.config.compiletime_charclasses = utf8.config.compiletime_charclasses or {
  utf8:require "charclass.compiletime.vanilla",
  utf8:require "charclass.compiletime.range",
  utf8:require "charclass.compiletime.stub",
}

--- Run the configured parsers in order and return the first result.
-- The source built from the winning class is stored in `ctx.prev_class`.
-- Returns nothing when no parser produces a class.
-- @return charclass builder, and the second value the parser returned
function utf8.regex.compiletime.charclass.parse(regex, c, bs, ctx)
  utf8.debug("parse charclass():", regex, c, bs, regex[bs])
  for i, parser in ipairs(utf8.config.compiletime_charclasses) do
    local charclass, nbs = parser(regex, c, bs, ctx)
    if charclass then
      ctx.prev_class = charclass:build()
      utf8.debug("cc", ctx.prev_class, i, c, bs, nbs)
      return charclass, nbs
    end
  end
end

end

-- =============================================================================
-- File: mac/.config/mpv/script-modules/utf8/charclass/compiletime/range.lua
-- (new file, mode 100644)
-- =============================================================================

--- Parser for `a-z` style ranges; only active while `ctx.internal` is set.
return function(utf8)

-- Read from the shared table: builder.lua registers itself at this path.
local cl = utf8.regex.compiletime.charclass.builder

local next = utf8.util.next

--- Try to read `<r1>-<r2>` starting at `c`; either end may be %-escaped.
-- Returns nothing when `ctx.internal` is unset, when the character after r1
-- is not '-', or when no upper end is found (the unescaped character after
-- '-' is '' or ']').
-- @return builder holding the single range, and `utf8.next(str, nbs) - bs`
return function(str, c, bs, ctx)
  if not ctx.internal then return end

  local nbs = bs

  -- Lower end: skip over an escaping '%'.
  if c == '%' then
    c, nbs = next(str, nbs)
  end
  local r1 = c

  utf8.debug("range r1", r1, nbs)

  c, nbs = next(str, nbs)
  if c ~= '-' then return end

  -- Upper end: an escaped character is taken unconditionally.
  local r2
  c, nbs = next(str, nbs)
  if c == '%' then
    c, nbs = next(str, nbs)
    r2 = c
  elseif c ~= '' and c ~= ']' then
    r2 = c
  end

  utf8.debug("range r2", r2, nbs)

  if not (r1 and r2) then
    return
  end
  return cl.new():with_ranges{utf8.byte(r1), utf8.byte(r2)}, utf8.next(str, nbs) - bs
end

end

-- =============================================================================
-- File: mac/.config/mpv/script-modules/utf8/charclass/compiletime/stub.lua
-- (new file, mode 100644)
-- =============================================================================

--- Fallback parser: always succeeds, treating `c` as a literal code.
return function(utf8)

-- Read from the shared table: builder.lua registers itself at this path.
local cl = utf8.regex.compiletime.charclass.builder

--- @return builder holding only `c`, and `utf8.next(str, bs) - bs`
return function(str, c, bs, ctx)
  local literal = cl.new():with_codes(c)
  return literal, utf8.next(str, bs) - bs
end

end

-- =============================================================================
-- File: mac/.config/mpv/script-modules/utf8/charclass/compiletime/vanilla.lua
-- (new file, mode 100644)
-- =============================================================================

--- Parser for %-escapes, [sets] and '.'.
return function(utf8)

local cl = utf8:require "charclass.compiletime.builder"

local next = utf8.util.next

-- Running counter; only used to tag debug output of a single parse call.
local token = 1

-- Lowercase escape letter -> class name.
local class_names = {
  a = 'alpha',
  c = 'cntrl',
  d = 'digit',
  g = 'graph',
  l = 'lower',
  p = 'punct',
  s = 'space',
  u = 'upper',
  w = 'alnum',
  x = 'xdigit',
}

--- Parse one character class starting at `c`.
-- Handles '%x' escapes, '[' ... ']' sets (only when not already inside a
-- set) and '.'. For any other `c` the returned class is nil.
-- Raises on a pattern that ends with '%' or on a set without closing ']'.
-- @return class builder (or nil), and `utf8.next(str, nbs) - bs`
local function parse(str, c, bs, ctx)
  local tttt = token
  token = token + 1

  local class
  local nbs = bs
  utf8.debug("cc_parse", tttt, str, c, nbs, next(str, nbs))

  if c == '%' then
    c, nbs = next(str, bs)
    if c == '' then
      error("malformed pattern (ends with '%')")
    end
    local lowered = utf8.raw.lower(c)
    -- Lowering changed the character, i.e. the escape letter was uppercase:
    -- that selects the complement.
    local complement = lowered ~= c
    local matched = class_names[lowered]

    if matched then
      if complement then
        class = cl.new():without_classes(matched)
      else
        class = cl.new():with_classes(matched)
      end
    elseif lowered == 'z' then
      class = cl.new():with_codes(0)
      if complement then
        class = class:invert()
      end
    else
      -- Any other escaped character stands for itself.
      class = cl.new():with_codes(c)
    end
  elseif c == '[' and not ctx.internal then
    local old_internal = ctx.internal
    ctx.internal = true
    class = cl.new()
    local firstletter = true
    while true do
      local prev_nbs = nbs
      c, nbs = next(str, nbs)
      utf8.debug("next", tttt, c, nbs)
      if c == '^' and firstletter then
        class:invert()
        -- A ']' directly after the leading '^' is a literal member.
        local nc, nnbs = next(str, nbs)
        if nc == ']' then
          class:with_codes(nc)
          nbs = nnbs
        end
      elseif c == ']' then
        if firstletter then
          -- A ']' in first position is a literal member.
          class:with_codes(c)
        else
          utf8.debug('] on pos', tttt, nbs)
          break
        end
      elseif c == '' then
        error "malformed pattern (missing ']')"
      else
        local sub_class, skip = utf8.regex.compiletime.charclass.parse(str, c, nbs, ctx)
        nbs = prev_nbs + skip
        utf8.debug("include", tttt, bs, prev_nbs, nbs, skip)
        class:include(sub_class)
      end
      firstletter = false
    end
    ctx.internal = old_internal
  elseif c == '.' then
    if ctx.internal then
      -- Inside a set the dot is an ordinary character.
      class = cl.new():with_codes(c)
    else
      -- An empty inverted class.
      class = cl.new():invert()
    end
  end

  return class, utf8.next(str, nbs) - bs
end

return parse

end

--[[
    x: (where x is not one of the magic characters ^$()%.[]*+-?) represents the character x itself.
    .: (a dot) represents all characters.
    %a: represents all letters.
    %c: represents all control characters.
    %d: represents all digits.
    %g: represents all printable characters except space.
    %l: represents all lowercase letters.
    %p: represents all punctuation characters.
    %s: represents all space characters.
    %u: represents all uppercase letters.
    %w: represents all alphanumeric characters.
    %x: represents all hexadecimal digits.
    %x: (where x is any non-alphanumeric character) represents the character x. This is the standard way to escape the magic characters. Any non-alphanumeric character (including all punctuation characters, even the non-magical) can be preceded by a '%' when used to represent itself in a pattern.
]]
-- (pattern reference notes trailing compiletime/vanilla.lua, continued)
--     [set]: represents the class which is the union of all characters in set. A range of characters can be specified by separating the end characters of the range, in ascending order, with a '-'. All classes %x described above can also be used as components in set. All other characters in set represent themselves. For example, [%w_] (or [_%w]) represents all alphanumeric characters plus the underscore, [0-7] represents the octal digits, and [0-7%l%-] represents the octal digits plus the lowercase letters plus the '-' character.
--
--     You can put a closing square bracket in a set by positioning it as the first character in the set. You can put a hyphen in a set by positioning it as the first or the last character in the set. (You can also use an escape for both cases.)
--
--     The interaction between ranges and classes is not defined. Therefore, patterns like [%a-z] or [a-%%] have no meaning.
--     [^set]: represents the complement of set, where set is interpreted as above.
--
-- For all classes represented by single letters (%a, %c, etc.), the corresponding uppercase letter represents the complement of the class. For instance, %S represents all non-space characters.

-- =============================================================================
-- File: mac/.config/mpv/script-modules/utf8/charclass/runtime/base.lua
-- (new file, mode 100644)
-- =============================================================================

--- Runtime character class: holds codes, ranges, named classes, negated
-- named classes and sub-classes, and tests a character code against them.
-- Named classes are resolved through `class:is`, which this base leaves
-- unimplemented.
return function(utf8)

local class = {}
local mt = {__index = class}

-- Append every vararg (up to the first nil) to the list stored under
-- `field`, creating the list on first use. Returns `self` for chaining.
local function extend(self, field, ...)
  local list = self[field] or {}
  self[field] = list
  for _, v in ipairs({...}) do
    list[#list + 1] = v
  end
  return self
end

-- Shared body of in_classes / in_not_classes.
-- Returns nil when `names` is absent or empty, true when `self:is` accepts
-- the code for any of the names, false otherwise.
local function matches_any_class(self, names, char_code)
  if not names or #names == 0 then return nil end

  for _, name in ipairs(names) do
    if self:is(name, char_code) then
      return true
    end
  end
  return false
end

--- Create an empty class.
function class.new()
  return setmetatable({}, mt)
end

--- Mark the class as inverted.
-- @return self
function class:invert()
  self.inverted = true
  return self
end

--- Add codes; the list is kept sorted because `in_codes` bisects it.
-- @return self
function class:with_codes(...)
  extend(self, 'codes', ...)
  table.sort(self.codes)
  return self
end

--- Add ranges, each a `{first, last}` pair (both ends inclusive).
-- @return self
function class:with_ranges(...)
  return extend(self, 'ranges', ...)
end

--- Add named classes.
-- @return self
function class:with_classes(...)
  return extend(self, 'classes', ...)
end

--- Add negated named classes.
-- @return self
function class:without_classes(...)
  return extend(self, 'not_classes', ...)
end

--- Add sub-classes (objects providing `test`).
-- @return self
function class:with_subs(...)
  return extend(self, 'subs', ...)
end

--- Binary search for `item` in the sorted code list.
-- @return nil when there are no codes; `true, index` on a hit; false otherwise
function class:in_codes(item)
  local codes = self.codes
  if not codes or #codes == 0 then return nil end

  local head, tail = 1, #codes
  while tail - head > 1 do
    local mid = math.floor((head + tail) / 2)
    if codes[mid] > item then
      tail = mid
    else
      head = mid
    end
  end

  if codes[head] == item then
    return true, head
  elseif codes[tail] == item then
    return true, tail
  end
  return false
end

--- @return nil when there are no ranges; true when any range contains the
-- code; false otherwise
function class:in_ranges(char_code)
  local ranges = self.ranges
  if not ranges or #ranges == 0 then return nil end

  for _, r in ipairs(ranges) do
    if r[1] <= char_code and char_code <= r[2] then
      return true
    end
  end
  return false
end

--- @return nil when there are no named classes; true when any of them
-- accepts the code; false otherwise
function class:in_classes(char_code)
  return matches_any_class(self, self.classes, char_code)
end

--- @return nil when there are no negated classes; true when the code belongs
-- to any of them; false otherwise
function class:in_not_classes(char_code)
  return matches_any_class(self, self.not_classes, char_code)
end

--- Membership test for a named class; the base implementation only raises.
function class:is(class_name, char_code)
  error("not implemented")
end

--- @return nil when there are no subs; true only when every sub accepts the
-- code; false as soon as one rejects it
function class:in_subs(char_code)
  local subs = self.subs
  if not subs or #subs == 0 then return nil end

  for _, sub in ipairs(subs) do
    if not sub:test(char_code) then
      return false
    end
  end
  return true
end

--- Test a character code; thin wrapper around `do_test`.
function class:test(char_code)
  local result = self:do_test(char_code)
  -- utf8.debug('class:test', result, "'" .. (char_code and utf8.char(char_code) or 'nil') .. "'", char_code)
  return result
end

--- Decide whether `char_code` belongs to the class.
-- Checks run in a fixed order and each `in_*` is only evaluated when the
-- earlier ones did not settle the answer:
--   1. a hit in a negated class yields `inverted` (as a boolean);
--   2. a hit in codes, ranges, classes or subs yields `not inverted`;
--   3. with no codes, ranges, classes or subs present at all and negated
--      classes present but not hit, the result is `not inverted`;
--   4. anything else yields `inverted` (as a boolean).
-- A nil/false `char_code` never matches.
function class:do_test(char_code)
  if not char_code then return false end

  local in_not_classes = self:in_not_classes(char_code)
  if in_not_classes then
    return not not self.inverted
  end

  local in_codes = self:in_codes(char_code)
  if in_codes then
    return not self.inverted
  end

  local in_ranges = self:in_ranges(char_code)
  if in_ranges then
    return not self.inverted
  end

  local in_classes = self:in_classes(char_code)
  if in_classes then
    return not self.inverted
  end

  local in_subs = self:in_subs(char_code)
  if in_subs then
    return not self.inverted
  end

  local nothing_positive = in_codes == nil and in_ranges == nil
    and in_classes == nil and in_subs == nil
  if nothing_positive and in_not_classes == false then
    return not self.inverted
  end
  return not not self.inverted
end

return class

end

-- =============================================================================
-- File: mac/.config/mpv/script-modules/utf8/charclass/runtime/dummy.lua
-- (new file, mode 100644)
-- =============================================================================

--- Runtime class without `is`: named classes are expanded into hard-coded
-- code ranges and codes when they are added.
return function(utf8)

local base = utf8:require "charclass.runtime.base"

local dummy = setmetatable({}, {__index = base})
local mt = {__index = dummy}

-- Class name -> function adding that class' ranges/codes to `self`.
-- Range tables are created per call, so instances never share them.
local expand = {
  alpha = function(self)
    self:with_ranges({65, 90}, {97, 122})
  end,
  cntrl = function(self)
    self:with_ranges({0, 31}):with_codes(127)
  end,
  digit = function(self)
    self:with_ranges({48, 57})
  end,
  graph = function(self)
    self:with_ranges({1, 8}, {14, 31}, {33, 132}, {134, 159}, {161, 5759}, {5761, 8191}, {8203, 8231}, {8234, 8238}, {8240, 8286}, {8288, 12287})
  end,
  lower = function(self)
    self:with_ranges({97, 122})
  end,
  punct = function(self)
    self:with_ranges({33, 47}, {58, 64}, {91, 96}, {123, 126})
  end,
  space = function(self)
    self:with_ranges({9, 13}):with_codes(32, 133, 160, 5760):with_ranges({8192, 8202}):with_codes(8232, 8233, 8239, 8287, 12288)
  end,
  upper = function(self)
    self:with_ranges({65, 90})
  end,
  alnum = function(self)
    self:with_ranges({48, 57}, {65, 90}, {97, 122})
  end,
  xdigit = function(self)
    self:with_ranges({48, 57}, {65, 70}, {97, 102})
  end,
}

--- Create an empty dummy class.
function dummy.new()
  return setmetatable({}, mt)
end

--- Add named classes by expanding each into ranges/codes.
-- Names that are not in the table above are ignored.
-- @return self
function dummy:with_classes(...)
  for _, name in ipairs({...}) do
    local add = expand[name]
    if add then
      add(self)
    end
  end
  return self
end

--- Add negated classes as a single inverted sub-class holding all of them.
-- With no arguments nothing is added.
-- @return self
function dummy:without_classes(...)
  local classes = {...}
  if #classes == 0 then
    return self
  end
  return self:with_subs(dummy.new():with_classes(...):invert())
end

return dummy

end

-- =============================================================================
-- File: mac/.config/mpv/script-modules/utf8/charclass/runtime/init.lua
-- (new file, mode 100644)
-- =============================================================================

--- Select the runtime charclass implementation.
-- `utf8.config.runtime_charclasses`, when set, wins: a table is returned as
-- is, a function is called with `utf8`, anything else is handed to
-- `utf8:require`. Otherwise the native implementation is used when the `ffi`
-- module can be required, and the dummy one when it cannot.
return function(utf8)

local provided = utf8.config.runtime_charclasses

if provided then
  local kind = type(provided)
  if kind == "table" then
    return provided
  elseif kind == "function" then
    return provided(utf8)
  else
    return utf8:require(provided)
  end
end

-- Only the success flag of pcall is needed here, not the module itself.
local has_ffi = pcall(require, "ffi")
if has_ffi then
  return utf8:require "charclass.runtime.native"
end
return utf8:require "charclass.runtime.dummy"

end
-- =============================================================================
-- File: mac/.config/mpv/script-modules/utf8/charclass/runtime/native.lua
-- (new file, mode 100644)
-- =============================================================================

--- Runtime class whose named classes are answered by the C `isw*` functions
-- reached through the `ffi` module.
-- Loading the module sets the "ctype" locale category from
-- `utf8.config.locale`.
return function(utf8)

os.setlocale(utf8.config.locale, "ctype")

local ffi = require("ffi")
ffi.cdef[[
  int iswalnum(int c);
  int iswalpha(int c);
  int iswascii(int c);
  int iswblank(int c);
  int iswcntrl(int c);
  int iswdigit(int c);
  int iswgraph(int c);
  int iswlower(int c);
  int iswprint(int c);
  int iswpunct(int c);
  int iswspace(int c);
  int iswupper(int c);
  int iswxdigit(int c);
]]

local base = utf8:require "charclass.runtime.base"

local native = setmetatable({}, {__index = base})
local mt = {__index = native}

-- Keep the namespace in a local; each C function is still looked up only
-- when its predicate is actually called.
local C = ffi.C

-- Class name -> predicate over a character code.
local predicates = {
  alpha  = function(code) return C.iswalpha(code) ~= 0 end,
  cntrl  = function(code) return C.iswcntrl(code) ~= 0 end,
  digit  = function(code) return C.iswdigit(code) ~= 0 end,
  graph  = function(code) return C.iswgraph(code) ~= 0 end,
  lower  = function(code) return C.iswlower(code) ~= 0 end,
  punct  = function(code) return C.iswpunct(code) ~= 0 end,
  space  = function(code) return C.iswspace(code) ~= 0 end,
  upper  = function(code) return C.iswupper(code) ~= 0 end,
  alnum  = function(code) return C.iswalnum(code) ~= 0 end,
  xdigit = function(code) return C.iswxdigit(code) ~= 0 end,
}

--- Create an empty native class.
function native.new()
  return setmetatable({}, mt)
end

--- Tell whether `char_code` belongs to the named class.
-- @return boolean for a known class name; nothing for an unknown one
function native:is(class, char_code)
  local predicate = predicates[class]
  if predicate then
    return predicate(char_code)
  end
end

return native

end
