--[[-- MIT License Copyright (c) 2018 Xavier Wang Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. --]]-- local utf8 = require 'init' utf8.config = { debug = nil, --utf8:require("util").debug, } utf8:init() print('testing pattern matching') local function f(s, p) local i,e = utf8.find(s, p) if i then return utf8.sub(s, i, e) end end local function f1(s, p) p = utf8.gsub(p, "%%([0-9])", function (s) return "%" .. (tonumber(s)+1) end) p = utf8.gsub(p, "^(^?)", "%1()", 1) p = utf8.gsub(p, "($?)$", "()%1", 1) local t = {utf8.match(s, p)} return utf8.sub(s, t[1], t[#t] - 1) end local a,b = utf8.find('', '') -- empty patterns are tricky assert(a == 1 and b == 0); a,b = utf8.find('alo', '') assert(a == 1 and b == 0) a,b = utf8.find('a\0o a\0o a\0o', 'a', 1) -- first position assert(a == 1 and b == 1) a,b = utf8.find('a\0o a\0o a\0o', 'a\0o', 2) -- starts in the midle assert(a == 5 and b == 7) a,b = utf8.find('a\0o a\0o a\0o', 'a\0o', 9) -- starts in the midle assert(a == 9 and b == 11) a,b = utf8.find('a\0a\0a\0a\0\0ab', '\0ab', 2); -- finds at the end assert(a == 9 and b == 11); a,b = utf8.find('a\0a\0a\0a\0\0ab', 'b') -- last position assert(a == 11 and b == 11) assert(utf8.find('a\0a\0a\0a\0\0ab', 'b\0') == nil) -- check ending assert(utf8.find('', '\0') == nil) assert(utf8.find('alo123alo', '12') == 4) assert(utf8.find('alo123alo', '^12') == nil) assert(utf8.match("aaab", ".*b") == "aaab") assert(utf8.match("aaa", ".*a") == "aaa") assert(utf8.match("b", ".*b") == "b") assert(utf8.match("aaab", ".+b") == "aaab") assert(utf8.match("aaa", ".+a") == "aaa") assert(not utf8.match("b", ".+b")) assert(utf8.match("aaab", ".?b") == "ab") assert(utf8.match("aaa", ".?a") == "aa") assert(utf8.match("b", ".?b") == "b") assert(f('aloALO', '%l*') == 'alo') assert(f('aLo_ALO', '%a*') == 'aLo') assert(f(" \n\r*&\n\r xuxu \n\n", "%g%g%g+") == "xuxu") assert(f('aaab', 'a*') == 'aaa'); assert(f('aaa', '^.*$') == 'aaa'); assert(f('aaa', 'b*') == ''); assert(f('aaa', 'ab*a') == 'aa') assert(f('aba', 'ab*a') == 'aba') assert(f('aaab', 'a+') == 'aaa') assert(f('aaa', '^.+$') == 'aaa') assert(f('aaa', 'b+') == nil) assert(f('aaa', 'ab+a') == nil) assert(f('aba', 'ab+a') == 'aba') assert(f('a$a', '.$') == 'a') assert(f('a$a', '.%$') == 'a$') assert(f('a$a', '.$.') == 'a$a') assert(f('a$a', '$$') == nil) assert(f('a$b', 'a$') == nil) assert(f('a$a', '$') == '') assert(f('', 'b*') == '') assert(f('aaa', 'bb*') == nil) assert(f('aaab', 'a-') == '') assert(f('aaa', '^.-$') == 'aaa') assert(f('aabaaabaaabaaaba', 'b.*b') == 'baaabaaabaaab') assert(f('aabaaabaaabaaaba', 'b.-b') == 'baaab') assert(f('alo xo', '.o$') == 'xo') assert(f(' \n isto é assim', '%S%S*') == 'isto') assert(f(' \n isto é assim', '%S*$') == 'assim') assert(f(' \n isto é assim', '[a-z]*$') == 'assim') assert(f('um caracter ? extra', '[^%sa-z]') == '?') assert(f('', 'a?') == '') assert(f('á', 'á?') == 'á') assert(f('ábl', 'á?b?l?') == 'ábl') assert(f(' ábl', 'á?b?l?') == '') assert(f('aa', '^aa?a?a') == 'aa') assert(f(']]]áb', '[^]]') == 'á') assert(f("0alo alo", "%x*") == "0a") assert(f("alo alo", "%C+") == "alo alo") print('+') assert(f1('alo alx 123 b\0o b\0o', '(..*) %1') == "b\0o b\0o") assert(f1('axz123= 4= 4 34', '(.+)=(.*)=%2 %1') == '3= 4= 4 3') assert(f1('=======', '^(=*)=%1$') == '=======') assert(utf8.match('==========', '^([=]*)=%1$') == nil) local function range (i, j) if i <= j then return i, range(i+1, j) end end local abc = utf8.char(range(0, 255)); assert(utf8.len(abc) == 256) assert(string.len(abc) == 384) local function strset (p) local res = {s=''} utf8.gsub(abc, p, function (c) res.s = res.s .. c end) return res.s end; local a, b, c, d, e, t -- local E = utf8.escape -- assert(utf8.len(strset(E'[%200-%210]')) == 11) assert(strset('[a-z]') == "abcdefghijklmnopqrstuvwxyz") assert(strset('[a-z%d]') == strset('[%da-uu-z]')) assert(strset('[a-]') == "-a") assert(strset('[^%W]') == strset('[%w]')) assert(strset('[]%%]') == '%]') assert(strset('[a%-z]') == '-az') assert(strset('[%^%[%-a%]%-b]') == '-[]^ab') -- assert(strset('%Z') == strset(E'[%1-%255]')) -- assert(strset('.') == strset(E'[%1-%255%%z]')) print('+'); assert(utf8.match("alo xyzK", "(%w+)K") == "xyz") assert(utf8.match("254 K", "(%d*)K") == "") assert(utf8.match("alo ", "(%w*)$") == "") assert(utf8.match("alo ", "(%w+)$") == nil) assert(utf8.find("(álo)", "%(á") == 1) a, b, c, d, e = utf8.match("âlo alo", "^(((.).).* (%w*))$") assert(a == 'âlo alo' and b == 'âl' and c == 'â' and d == 'alo' and e == nil) a, b, c, d = utf8.match('0123456789', '(.+(.?)())') assert(a == '0123456789' and b == '' and c == 11 and d == nil) print('+') assert(utf8.gsub('ülo ülo', 'ü', 'x') == 'xlo xlo') assert(utf8.gsub('alo úlo ', ' +$', '') == 'alo úlo') -- trim assert(utf8.gsub(' alo alo ', '^%s*(.-)%s*$', '%1') == 'alo alo') -- double trim assert(utf8.gsub('alo alo \n 123\n ', '%s+', ' ') == 'alo alo 123 ') t = "abç d" a, b = utf8.gsub(t, '(.)', '%1@') assert('@'..a == utf8.gsub(t, '', '@') and b == 5) a, b = utf8.gsub('abçd', '(.)', '%0@', 2) assert(a == 'a@b@çd' and b == 2) assert(utf8.gsub('alo alo', '()[al]', '%1') == '12o 56o') assert(utf8.gsub("abc=xyz", "(%w*)(%p)(%w+)", "%3%2%1-%0") == "xyz=abc-abc=xyz") assert(utf8.gsub("abc", "%w", "%1%0") == "aabbcc") assert(utf8.gsub("abc", "%w+", "%0%1") == "abcabc") assert(utf8.gsub('áéí', '$', '\0óú') == 'áéí\0óú') assert(utf8.gsub('', '^', 'r') == 'r') assert(utf8.gsub('', '$', 'r') == 'r') print('+') assert(utf8.gsub("um (dois) tres (quatro)", "(%(%w+%))", utf8.upper) == "um (DOIS) tres (QUATRO)") do local function setglobal (n,v) rawset(_G, n, v) end utf8.gsub("a=roberto,roberto=a", "(%w+)=(%w%w*)", setglobal) assert(_G.a=="roberto" and _G.roberto=="a") end function f(a,b) return utf8.gsub(a,'.',b) end assert(utf8.gsub("trocar tudo em |teste|b| é |beleza|al|", "|([^|]*)|([^|]*)|", f) == "trocar tudo em bbbbb é alalalalalal") local function dostring (s) return (loadstring or load)(s)() or "" end assert(utf8.gsub("alo $a=1$ novamente $return a$", "$([^$]*)%$", dostring) == "alo novamente 1") x = utf8.gsub("$local utf8=require'init' x=utf8.gsub('alo', '.', utf8.upper)$ assim vai para $return x$", "$([^$]*)%$", dostring) assert(x == ' assim vai para ALO') local s,r t = {} s = 'a alo jose joao' r = utf8.gsub(s, '()(%w+)()', function (a,w,b) assert(utf8.len(w) == b-a); t[a] = b-a; end) assert(s == r and t[1] == 1 and t[3] == 3 and t[7] == 4 and t[13] == 4) local function isbalanced (s) return utf8.find(utf8.gsub(s, "%b()", ""), "[()]") == nil end assert(isbalanced("(9 ((8))(\0) 7) \0\0 a b ()(c)() a")) assert(not isbalanced("(9 ((8) 7) a b (\0 c) a")) assert(utf8.gsub("alo 'oi' alo", "%b''", '"') == 'alo " alo') local t = {"apple", "orange", "lime"; n=0} assert(utf8.gsub("x and x and x", "x", function () t.n=t.n+1; return t[t.n] end) == "apple and orange and lime") t = {n=0} utf8.gsub("first second word", "%w%w*", function (w) t.n=t.n+1; t[t.n] = w end) assert(t[1] == "first" and t[2] == "second" and t[3] == "word" and t.n == 3) t = {n=0} assert(utf8.gsub("first second word", "%w+", function (w) t.n=t.n+1; t[t.n] = w end, 2) == "first second word") assert(t[1] == "first" and t[2] == "second" and t[3] == nil) assert(not pcall(utf8.gsub, "alo", "(.", print)) assert(not pcall(utf8.gsub, "alo", ".)", print)) assert(not pcall(utf8.gsub, "alo", "(.", {})) assert(not pcall(utf8.gsub, "alo", "(.)", "%2")) assert(not pcall(utf8.gsub, "alo", "(%1)", "a")) --[[-- Stepets: ignoring this test because it's probably bug in Lua. %0 should be interpreted as capture reference only in replacement arg it doesn't have sense in pattern --]]-- -- assert(not pcall(utf8.gsub, "alo", "(%0)", "a")) -- bug since 2.5 (C-stack overflow) -- todo: benchmark OOM -- do -- local function f (size) -- local s = string.rep("a", size) -- local p = string.rep(".?", size) -- return pcall(utf8.match, s, p) -- end -- local r, m = f(80) -- assert(r and #m == 80) -- r, m = f(200000) -- assert(not r and utf8.find(m, "too complex")) -- end -- if not _soft then -- -- big strings -- local a = string.rep('a', 300000) -- assert(utf8.find(a, '^a*.?$')) -- assert(not utf8.find(a, '^a*.?b$')) -- assert(utf8.find(a, '^a-.?$')) -- -- bug in 5.1.2 -- a = string.rep('a', 10000) .. string.rep('b', 10000) -- assert(not pcall(utf8.gsub, a, 'b')) -- end -- recursive nest of gsubs local function rev (s) return utf8.gsub(s, "(.)(.+)", function (c,s1) return rev(s1)..c end) end local x = "abcdef" assert(rev(rev(x)) == x) -- gsub with tables assert(utf8.gsub("alo alo", ".", {}) == "alo alo") assert(utf8.gsub("alo alo", "(.)", {a="AA", l=""}) == "AAo AAo") assert(utf8.gsub("alo alo", "(.).", {a="AA", l="K"}) == "AAo AAo") assert(utf8.gsub("alo alo", "((.)(.?))", {al="AA", o=false}) == "AAo AAo") assert(utf8.gsub("alo alo", "().", {2,5,6}) == "256 alo") t = {}; setmetatable(t, {__index = function (t,s) return utf8.upper(s) end}) assert(utf8.gsub("a alo b hi", "%w%w+", t) == "a ALO b HI") -- tests for gmatch local a = 0 for i in utf8.gmatch('abcde', '()') do assert(i == a+1); a=i end assert(a==6) t = {n=0} for w in utf8.gmatch("first second word", "%w+") do t.n=t.n+1; t[t.n] = w end assert(t[1] == "first" and t[2] == "second" and t[3] == "word") t = {3, 6, 9} for i in utf8.gmatch ("xuxx uu ppar r", "()(.)%2") do assert(i == table.remove(t, 1)) end assert(#t == 0) t = {} for i,j in utf8.gmatch("13 14 10 = 11, 15= 16, 22=23", "(%d+)%s*=%s*(%d+)") do t[i] = j end a = 0 for k,v in pairs(t) do assert(k+1 == v+0); a=a+1 end assert(a == 3) -- tests for `%f' (`frontiers') assert(utf8.gsub("aaa aa a aaa a", "%f[%w]a", "x") == "xaa xa x xaa x") assert(utf8.gsub("[[]] [][] [[[[", "%f[[].", "x") == "x[]] x]x] x[[[") assert(utf8.gsub("01abc45de3", "%f[%d]", ".") == ".01abc.45de.3") assert(utf8.gsub("01abc45 de3x", "%f[%D]%w", ".") == "01.bc45 de3.") -- local u = utf8.escape -- assert(utf8.gsub("function", u"%%f[%1-%255]%%w", ".") == ".unction") -- assert(utf8.gsub("function", u"%%f[^%1-%255]", ".") == "function.") --[[-- Stepets: %z is Lua 5.1 class for representing \0 Lua 5.2, Lua 5.3 doesn't have it in documentation. So it's considered deprecated. --]]-- assert(utf8.find("a", "%f[a]") == 1) assert(utf8.find("a", "%f[^%z]") == 1) assert(utf8.find("a", "%f[^%l]") == 2) assert(utf8.find("aba", "%f[a%z]") == 3) assert(utf8.find("aba", "%f[%z]") == 4) assert(not utf8.find("aba", "%f[%l%z]")) assert(not utf8.find("aba", "%f[^%l%z]")) local i, e = utf8.find(" alo aalo allo", "%f[%S].-%f[%s].-%f[%S]") assert(i == 2 and e == 5) local k = utf8.match(" alo aalo allo", "%f[%S](.-%f[%s].-%f[%S])") assert(k == 'alo ') local a = {1, 5, 9, 14, 17,} for k in utf8.gmatch("alo alo th02 is 1hat", "()%f[%w%d]") do assert(table.remove(a, 1) == k) end assert(#a == 0) -- malformed patterns local function malform (p, m) m = m or "malformed" local r, msg = pcall(utf8.find, "a", p) assert(not r and utf8.find(msg, m)) end malform("[a") malform("[]") malform("[^]") malform("[a%]") malform("[a%") malform("%b", "unbalanced") malform("%ba", "unbalanced") malform("%") malform("%f", "missing") -- \0 in patterns assert(utf8.match("ab\0\1\2c", "[\0-\2]+") == "\0\1\2") assert(utf8.match("ab\0\1\2c", "[\0-\0]+") == "\0") assert(utf8.find("b$a", "$\0?") == 2) assert(utf8.find("abc\0efg", "%\0") == 4) assert(utf8.match("abc\0efg\0\1e\1g", "%b\0\1") == "\0efg\0\1e\1") assert(utf8.match("abc\0\0\0", "%\0+") == "\0\0\0") assert(utf8.match("abc\0\0\0", "%\0%\0?") == "\0\0") -- magic char after \0 assert(utf8.find("abc\0\0","\0.") == 4) assert(utf8.find("abcx\0\0abc\0abc","x\0\0abc\0a.") == 4) print('OK')