diff options
Diffstat (limited to 'mac/.config/mpv/script-modules/utf8/charclass/compiletime')
5 files changed, 333 insertions, 0 deletions
-- ===========================================================================
-- File: mac/.config/mpv/script-modules/utf8/charclass/compiletime/builder.lua
-- (new file)
-- ===========================================================================

--- Compile-time character-class builder.
-- Collects codes, ranges and named classes while a pattern is parsed, and
-- renders them with `build()` as a string of Lua source text.
return function(utf8)

local byte = utf8.byte
local unpack = utf8.config.unpack

local builder = {}
local mt = {__index = builder}

utf8.regex.compiletime.charclass.builder = builder

--- Create an empty builder.
function builder.new()
  return setmetatable({}, mt)
end

--- Mark the class as a complement; `build()` then appends `:invert()`.
function builder:invert()
  self.inverted = true
  return self
end

--- Flag the class as being enclosed in [].
-- NOTE(review): the flag is stored under the same key as this method, so on
-- this instance `internal` is a boolean afterwards and a second `:internal()`
-- call would fail. Left unchanged because code outside this listing may read
-- the field.
function builder:internal() -- is it enclosed in []
  self.internal = true
  return self
end

--- Add single code points.
-- Each argument is either a number (used as is) or a string (converted with
-- `utf8.byte`). The stored list is kept sorted.
function builder:with_codes(...)
  local codes = self.codes or {}
  self.codes = codes

  for _, v in ipairs({...}) do
    codes[#codes + 1] = type(v) == "number" and v or byte(v)
  end

  table.sort(codes)
  return self
end

--- Add ranges; `build()` reads each one as a `{first, last}` pair.
function builder:with_ranges(...)
  local ranges = self.ranges or {}
  self.ranges = ranges

  for _, v in ipairs({...}) do
    ranges[#ranges + 1] = v
  end

  return self
end

--- Add named classes (e.g. 'alpha', 'digit').
function builder:with_classes(...)
  local classes = self.classes or {}
  self.classes = classes

  for _, v in ipairs({...}) do
    classes[#classes + 1] = v
  end

  return self
end

--- Add named classes whose complement is wanted (e.g. %S -> 'space').
function builder:without_classes(...)
  local not_classes = self.not_classes or {}
  self.not_classes = not_classes

  for _, v in ipairs({...}) do
    not_classes[#not_classes + 1] = v
  end

  return self
end

--- Merge another builder into this one.
-- A non-inverted builder is flattened into this builder's own lists. An
-- inverted one cannot be flattened, so it is kept whole in `self.includes`
-- and rendered by `build()` inside `with_subs(...)`.
function builder:include(b)
  if not b.inverted then
    if b.codes then
      self:with_codes(unpack(b.codes))
    end
    if b.ranges then
      self:with_ranges(unpack(b.ranges))
    end
    if b.classes then
      self:with_classes(unpack(b.classes))
    end
    if b.not_classes then
      self:without_classes(unpack(b.not_classes))
    end
  else
    self.includes = self.includes or {}
    self.includes[#self.includes + 1] = b
  end
  return self
end

-- Render a list of class names as `'a', 'b'`; nil or empty gives ''.
local function quoted_list(names)
  if not names or #names == 0 then
    return ''
  end
  return "'" .. table.concat(names, "', '") .. "'"
end

--- Render the collected class as Lua source text.
-- A lone, non-inverted code gets a minimal inline `test` function; anything
-- else becomes a `cl.new():with_codes(...):with_ranges(...)...` call chain.
-- @treturn string Lua source
function builder:build()
  local single_code = self.codes and #self.codes == 1
    and not self.inverted
    and not self.ranges
    and not self.classes
    and not self.not_classes
    and not self.includes

  if single_code then
    return "{test = function(self, cc) return cc == " .. self.codes[1] .. " end}"
  end

  local codes_list = table.concat(self.codes or {}, ', ')

  -- Collect the pieces and join once instead of growing a string in a loop.
  local ranges = {}
  for i, r in ipairs(self.ranges or {}) do
    ranges[i] = '{' .. tostring(r[1]) .. ', ' .. tostring(r[2]) .. '}'
  end
  local ranges_list = table.concat(ranges, ', ')

  local classes_list = quoted_list(self.classes)
  local not_classes_list = quoted_list(self.not_classes)

  local subs = {}
  for i, sub in ipairs(self.includes or {}) do
    subs[i] = sub:build()
  end
  local subs_list = table.concat(subs, ', ')

  local src = [[cl.new():with_codes(
        ]] .. codes_list .. [[
    ):with_ranges(
        ]] .. ranges_list .. [[
    ):with_classes(
        ]] .. classes_list .. [[
    ):without_classes(
        ]] .. not_classes_list .. [[
    ):with_subs(
        ]] .. subs_list .. [[
    )]]

  if self.inverted then
    src = src .. ':invert()'
  end

  return src
end

return builder

end

-- ===========================================================================
-- File: mac/.config/mpv/script-modules/utf8/charclass/compiletime/parser.lua
-- (new file)
-- ===========================================================================

--- Character-class parser dispatcher.
-- Installs the default list of class parsers (unless one is already
-- configured) and defines `utf8.regex.compiletime.charclass.parse`.
return function(utf8)

-- Order matters: parsers are tried first to last and the first one that
-- returns a class wins.
utf8.config.compiletime_charclasses = utf8.config.compiletime_charclasses or {
  utf8:require "charclass.compiletime.vanilla",
  utf8:require "charclass.compiletime.range",
  utf8:require "charclass.compiletime.stub",
}

--- Try each configured parser on the character `c` found at `bs` in `regex`.
-- On success the built source of the class is also stored in
-- `ctx.prev_class`.
-- @return the class builder and the second value reported by the parser that
--   matched; nothing if no parser matched
function utf8.regex.compiletime.charclass.parse(regex, c, bs, ctx)
  utf8.debug("parse charclass():", regex, c, bs, regex[bs])
  for _, p in ipairs(utf8.config.compiletime_charclasses) do
    local charclass, nbs = p(regex, c, bs, ctx)
    if charclass then
      ctx.prev_class = charclass:build()
      utf8.debug("cc", ctx.prev_class, _, c, bs, nbs)
      return charclass, nbs
    end
  end
end

end

-- ===========================================================================
-- File: mac/.config/mpv/script-modules/utf8/charclass/compiletime/range.lua
-- (new file)
-- ===========================================================================

--- Parser for `a-z` style ranges; only active inside a [set].
return function(utf8)

local cl = utf8.regex.compiletime.charclass.builder

local next = utf8.util.next

--- Recognise `r1-r2`, where either bound may be written as `%x`.
-- @return a builder holding the range plus `utf8.next(str, nbs) - bs`, or
--   nothing when the text at `bs` is not a complete range
return function(str, c, bs, ctx)
  if not ctx.internal then return end

  local r1, r2
  local nbs = bs

  -- Lower bound, optionally escaped.
  if c == '%' then
    c, nbs = next(str, nbs)
  end
  r1 = c

  utf8.debug("range r1", r1, nbs)

  c, nbs = next(str, nbs)
  if c ~= '-' then return end

  -- Upper bound. An unescaped ']' closes the set, so the '-' is literal.
  c, nbs = next(str, nbs)
  if c == '%' then
    c, nbs = next(str, nbs)
    -- '' means the pattern ended right after '%'. The empty string is truthy
    -- in Lua, so it must be rejected explicitly; otherwise a range without an
    -- upper bound would be emitted. Returning nothing lets the remaining
    -- parsers handle (and report) the dangling '%'.
    if c ~= '' then
      r2 = c
    end
  elseif c ~= '' and c ~= ']' then
    r2 = c
  end

  utf8.debug("range r2", r2, nbs)

  if r1 and r2 then
    return cl.new():with_ranges{utf8.byte(r1), utf8.byte(r2)}, utf8.next(str, nbs) - bs
  else
    return
  end
end

end

-- ===========================================================================
-- File: mac/.config/mpv/script-modules/utf8/charclass/compiletime/stub.lua
-- (new file)
-- ===========================================================================

--- Fallback parser: treats the current character as a literal.
-- It always returns a class, so it is listed last in the default parser list.
return function(utf8)

local cl = utf8.regex.compiletime.charclass.builder

return function(str, c, bs, ctx)
  return cl.new():with_codes(c), utf8.next(str, bs) - bs
end

end

-- ===========================================================================
-- File: mac/.config/mpv/script-modules/utf8/charclass/compiletime/vanilla.lua
-- (new file)
-- ===========================================================================

--- Parser for the standard Lua pattern classes: `%a`-style escapes, [sets]
-- and '.'. The Lua manual excerpt it follows is reproduced after the module.
return function(utf8)

local cl = utf8:require "charclass.compiletime.builder"

local next = utf8.util.next

-- Counter used only to tag debug output of (possibly nested) parse calls.
local token = 1

-- Lower-case class letter -> class name handed to the builder.
local class_names = {
  a = 'alpha',
  c = 'cntrl',
  d = 'digit',
  g = 'graph',
  l = 'lower',
  p = 'punct',
  s = 'space',
  u = 'upper',
  w = 'alnum',
  x = 'xdigit',
}

--- Parse one class starting at character `c` (position `bs`) of `str`.
-- @return the class builder (nil when `c` starts none of the forms handled
--   here) and `utf8.next(str, nbs) - bs`, where `nbs` is the position of the
--   last character consumed
-- Raises an error for a pattern ending in '%' or a set without closing ']'.
local function parse(str, c, bs, ctx)
  local trace_id = token
  token = token + 1

  local class
  local nbs = bs
  utf8.debug("cc_parse", trace_id, str, c, nbs, next(str, nbs))

  if c == '%' then
    c, nbs = next(str, bs)
    if c == '' then
      error("malformed pattern (ends with '%')")
    end
    local _c = utf8.raw.lower(c)
    local matched = class_names[_c]

    if matched then
      -- An upper-case letter (%A, %S, ...) selects the complement.
      if _c ~= c then
        class = cl.new():without_classes(matched)
      else
        class = cl.new():with_classes(matched)
      end
    elseif _c == 'z' then
      -- %z is the character with code 0, %Z everything else.
      class = cl.new():with_codes(0)
      if _c ~= c then
        class = class:invert()
      end
    else
      -- Any other escaped character stands for itself.
      class = cl.new():with_codes(c)
    end
  elseif c == '[' and not ctx.internal then
    local old_internal = ctx.internal
    ctx.internal = true
    class = cl.new()
    local firstletter = true
    while true do
      local prev_nbs = nbs
      c, nbs = next(str, nbs)
      utf8.debug("next", trace_id, c, nbs)
      if c == '^' and firstletter then
        class:invert()
        -- A ']' right after the leading '^' is a literal member of the set.
        local nc, nnbs = next(str, nbs)
        if nc == ']' then
          class:with_codes(nc)
          nbs = nnbs
        end
      elseif c == ']' then
        if firstletter then
          -- A ']' in first position is a literal member of the set.
          class:with_codes(c)
        else
          utf8.debug('] on pos', trace_id, nbs)
          break
        end
      elseif c == '' then
        error "malformed pattern (missing ']')"
      else
        -- Delegate to the configured parsers (ctx.internal is set, so nested
        -- '[' and '.' are treated as literals and ranges are enabled).
        local sub_class, skip = utf8.regex.compiletime.charclass.parse(str, c, nbs, ctx)
        nbs = prev_nbs + skip
        utf8.debug("include", trace_id, bs, prev_nbs, nbs, skip)
        class:include(sub_class)
      end
      firstletter = false
    end
    ctx.internal = old_internal
  elseif c == '.' then
    if not ctx.internal then
      -- Outside a set '.' matches everything: an empty class, inverted.
      class = cl.new():invert()
    else
      class = cl.new():with_codes(c)
    end
  end

  return class, utf8.next(str, nbs) - bs
end

return parse

end

--[[
    x: (where x is not one of the magic characters ^$()%.[]*+-?) represents the character x itself.
    .: (a dot) represents all characters.
    %a: represents all letters.
    %c: represents all control characters.
    %d: represents all digits.
    %g: represents all printable characters except space.
    %l: represents all lowercase letters.
    %p: represents all punctuation characters.
    %s: represents all space characters.
    %u: represents all uppercase letters.
    %w: represents all alphanumeric characters.
    %x: represents all hexadecimal digits.
    %x: (where x is any non-alphanumeric character) represents the character x. This is the standard way to escape the magic characters. Any non-alphanumeric character (including all punctuation characters, even the non-magical) can be preceded by a '%' when used to represent itself in a pattern.
    [set]: represents the class which is the union of all characters in set. A range of characters can be specified by separating the end characters of the range, in ascending order, with a '-'. All classes %x described above can also be used as components in set. All other characters in set represent themselves. For example, [%w_] (or [_%w]) represents all alphanumeric characters plus the underscore, [0-7] represents the octal digits, and [0-7%l%-] represents the octal digits plus the lowercase letters plus the '-' character.

    You can put a closing square bracket in a set by positioning it as the first character in the set. You can put a hyphen in a set by positioning it as the first or the last character in the set. (You can also use an escape for both cases.)

    The interaction between ranges and classes is not defined. Therefore, patterns like [%a-z] or [a-%%] have no meaning.
    [^set]: represents the complement of set, where set is interpreted as above.

For all classes represented by single letters (%a, %c, etc.), the corresponding uppercase letter represents the complement of the class. For instance, %S represents all non-space characters.
]]
