Module:User:Cscott/llpeg
Appearance
This module is rated as beta, and is ready for widespread use. It is still new and should be used with some caution to ensure the results are as expected. |
This is a port to Scribunto lua of lpeglabel, which itself is a minor extension of the lua lpeg parser generator. The API should be identical to that of lpeglabel, except that the regular expressions are "natively unicode"; that is, the .
pattern matches a single unicode codepoint, not a single byte as in the original.
These files have been packaged using a script to turn them into a single lua file.
Usage
[edit]{{#invoke:User:Cscott/llpeg|function_name}}
return (function()
local builders = {}
local function register(name, f)
builders[name] = f
end
register('advent.compat', function() return require [[Module:User:Cscott/compat]] end)
register('llpeg.types', function(myrequire)
myrequire('strict')
local compat = myrequire('advent.compat')
local CHARMAX = 0x7F -- maximum codepoint for charsets
-- metatable for pattern objects; will be filled in later
local metareg = {}
local enum = function(keys)
local Enum = {}
Enum.__index = Enum
function Enum:__tostring() return self.name end
function Enum:pairs() return keys end
function Enum:type() return Enum end
for name, value in pairs(keys) do
Enum[name] = setmetatable({ name = name, value = value }, Enum)
end
return Enum
end
local CapKind = enum{
close = "close", -- not used in trees */
position = "position",
const = "constant", -- ktable[key] is Lua constant
backref = "backref", -- ktable[key] is "name" of group to get capture
arg = "argument", -- 'key' is arg's number
simple = "simple", -- next node is pattern
table = "table", -- next node is pattern
["function"] = "function", -- ktable[key] is function; next node is pattern
acc = "acc", -- ktable[key] is function; next node is pattern
query = "query", -- ktable[key] is table; next node is pattern
string = "string", -- ktable[key] is string; next node is pattern
num = "num", -- numbered capture; 'key' is number of value to return
subst = "substitution", -- substitution capture; next node is pattern
fold = "fold", -- ktable[key] is function; next node is pattern
runtime = "runtime", -- not used in trees (is uses another type for tree)
group = "group", -- ktable[key] is group's "name"
}
local TTag = enum{
Char = "char", -- 'n' has unicode codepoint
Set = "set", -- 'set' has sparse array codepoint->true for codepoint <=CHARMAX
-- 'rest' indicates whether all codepoints > CHARMAX should be
-- part of the set (true) or not (false)
Any = "any",
True = "true",
False = "false",
UTFR = "utf8.range", --[[ range of UTF-8 codepoints;
'from' has initial codepoint; 'to' has final codepoint ]]--
Rep = "rep", -- 'sib1' *
Seq = "seq", -- 'sib1' 'sib2'
Choice = "choice", -- 'sib1' / 'sib2'
Not = "not", -- !'sib1'
And = "and", -- &'sib1'
Call = "call", -- 'sib2' is rule being called; otherwise same as TOpenCall
OpenCall = "opencall", -- 'key' is rule name
Rule = "rule", --[[ 'key' is rule name (but key == nil for unused rules);
'sib1' is rule's pattern pre-rule; 'sib2' is next rule;
'n' is rule's sequential number, 'name' is rule name (even
for unused rules) ]]--
XInfo = "xinfo", -- extra info (not used)
Grammar = "grammar", -- 'sib1' is initial (and first) rule, 'n' is # rules
Behind = "behind", -- 'sib1' is pattern, 'n' is how much to go back
Capture = "capture", --[[ captures: 'cap' is kind of capture (enum 'CapKind');
'key' is Lua value associated with capture;
'sib1' is capture body ]]--
RunTime = "run-time", --[[ run-time capture: 'key' is Lua function;
'sib1' is capture body ]]--
Throw = "throw", -- labeled failure: 'key' is label's name,
-- sib2 is associated recovery rule
}
local PE = enum{
nullable = "nullable",
nofail = "nofail",
}
-- virtual machine instructions
local Opcode = enum{
Any = "any", -- if no char, fail
Char = "char", -- if char != aux, fail
Set = "set", -- if char not in buff, fail
TestAny = "testany", -- in no char, jump to 'offset'
TestChar = "testchar", -- if char != aux, jump to 'offset'
TestSet = "testset", -- if char not in buff, jump to 'offset'
Span = "span", -- read a span of chars in buff
UTFR = "utf-range", -- if codepoint not in range [offset, utf_to], fail
Behind = "behind", -- walk back 'aux' characters (fail if not possible)
Ret = "ret", -- return from a rule
End = "end", -- end of pattern
Choice = "choice", -- stack a choice; next fail will jump to 'offset'
PredChoice = "pred_choice", -- labeled failure: stack a choice; changes label env next fail will jump to 'offset'
Jmp = "jmp", -- jump to 'offset'
Call = "call", -- call rule at 'offset'
OpenCall = "open_call", -- call rule number 'key' (must be closed to a ICall)
Commit = "commit", -- pop choice and jump to 'offset'
PartialCommit = "partial_commit", -- update top choice to current position and jump
BackCommit = "back_commit", -- backtrack like "fail" but jump to its own 'offset'
FailTwice = "failtwice", -- pop one choice and then fail
Fail = "fail", -- go back to saved state on choice and jump to saved offset
Giveup = "giveup", -- internal use
FullCapture = "fullcapture", -- complete capture of last 'off' chars
OpenCapture = "opencapture", -- start a capture
CloseCapture = "closecapture",
CloseRunTime = "closeruntime",
Throw = "throw", -- fails with a given label --labeled failure
ThrowRec = "throw_rec", -- fails with a given label and call rule at 'offset' --labeled failure
Empty = "--", -- to fill empty slots left by optimizations
}
-- helper for visitor pattern definitions
function define(dispatch, which, f)
for _,v in pairs(which) do
assert(v ~= nil) -- catch typos
dispatch[v] = f
end
end
local numsiblings = {}
define(numsiblings, {
TTag.Char, TTag.Set, TTag.Any,
TTag.True, TTag.False, TTag.UTFR,
TTag.Call, TTag.OpenCall,
TTag.Throw,
}, 0)
define(numsiblings, {
TTag.Rep, TTag.Not, TTag.And, TTag.Grammar,
TTag.Behind, TTag.Capture, TTag.RunTime,
}, 1)
define(numsiblings, {
TTag.Seq, TTag.Choice, TTag.Rule,
}, 2)
-- more help for visitor functions
local function_name_registry = {}
function register_fname(name, f)
assert(type(name) == "string")
assert(type(f) == "function")
function_name_registry[f] = name
end
function report_ferror(f, msg)
local fname = function_name_registry[f]
if fname ~= nil then
msg = fname .. ": " .. msg
end
error(msg)
end
function define_type_visitor(tbl)
local dispatch = {}
for keys,func in pairs(tbl) do
if type(keys) ~= "table" then
keys = { keys }
end
define(dispatch, keys, func)
end
local visit
visit = function(val, ...)
local a = dispatch["assert"]
if a ~= nil then a(val, ...) end -- assert preconditions
local ty = type(val)
if ty == 'table' and getmetatable(val) == metareg then
ty = 'pattern'
end
local f = dispatch[ty]
if f ~= nil then return f(val, ...) end
f = dispatch.default
if f == nil then
report_ferror(visit, "no default for " .. ty)
end
return f(val, ...)
end
return visit
end
function define_tree_visitor(tbl, opt_name)
local dispatch = {}
for keys,func in pairs(tbl) do
if type(keys) ~= "table" or getmetatable(keys) == TTag then
keys = { keys }
end
define(dispatch, keys, func)
end
local visit
visit = function(tree, ...)
if tree == nil then report_ferror(visit, "nil tree") end
local a = dispatch["assert"]
if a ~= nil then a(tree, ...) end -- assert preconditions
local f = dispatch[tree.tag]
if f ~= nil then return f(tree, ...) end
f = dispatch.default
if f == nil then
report_ferror(visit, "no default for " .. tree.tag)
end
return f(tree, ...)
end
return visit
end
function define_opcode_visitor(tbl)
local dispatch = {}
for keys,func in pairs(tbl) do
if type(keys) ~= "table" or getmetatable(keys) == Opcode then
keys = { keys }
end
define(dispatch, keys, func)
end
local visit
visit = function(op, ...)
if op == nil then report_ferror(visit, "nil op") end
local a = dispatch["assert"]
if a ~= nil then a(op, ...) end -- assert preconditions
local f = dispatch[op.code]
if f ~= nil then return f(op, ...) end
f = dispatch.default
if f == nil then
report_ferror(visit, "no default for " .. op.code)
end
return f(op, ...)
end
return visit
end
-- helper for module imports
function from(mod, list)
local result = {}
for _,v in ipairs(list) do
table.insert(result, mod[v])
end
return compat.unpack(result)
end
function newcharset()
return setmetatable({
tag = TTag.Set,
code = nil,
rest = false,
set = {}
}, metareg)
end
local fullset = newcharset()
for i = 0,CHARMAX do
fullset.set[i] = true
end
fullset.rest = true -- make sure non-ascii unicode chars are included!
assert(fullset.tag == TTag.Set)
return {
CHARMAX = CHARMAX,
CapKind = CapKind,
Opcode = Opcode,
PE = PE,
TTag = TTag,
define = define,
define_tree_visitor = define_tree_visitor,
enum = enum,
from = from,
fullset = fullset,
metareg = metareg,
newcharset = newcharset,
numsiblings = numsiblings,
register_fname = register_fname,
}
end)
register('llpeg.print', function(myrequire)
myrequire('strict')
local compat = myrequire('advent.compat')
local from = myrequire('llpeg.types').from
local
CHARMAX,
CapKind,
Opcode,
TTag,
define,
define_tree_visitor,
numsiblings,
_ = from(myrequire('llpeg.types'), {
'CHARMAX',
'CapKind',
'Opcode',
'TTag',
'define',
'define_tree_visitor',
'numsiblings',
})
function printcharset(tree)
local result = "["
local i = 0
while i <= CHARMAX do
local first = i
while tree.set[i] and i <= CHARMAX do
i = i + 1
end
if first == (i - 1) then -- unary range
result = result .. string.format("(%02x)", first)
elseif first < (i-1) then -- non-empty range
result = result .. string.format("(%02x-%02x)", first, i - 1)
end
i = i + 1
end
if tree.rest then
result = result .. "(80-FFFF)"
end
return result .. "]"
end
function printjmp(op, pc)
return "-> " .. op.target
end
local printinst_helper = define_opcode_visitor{
[Opcode.Char] = function(op, pc)
return string.format("'%c' (%02x)", op.aux, op.aux)
end,
[Opcode.TestChar] = function(op, pc)
return string.format("'%c' (%02x)", op.aux, op.aux) .. printjmp(op, pc)
end,
[Opcode.UTFR] = function(op, pc)
return string.format("%d - %d", op.from, op.to)
end,
[Opcode.FullCapture] = function(op, pc)
return string.format("%s (size = %s) (idx = %s)",
op.cap.value, op.aux, op.key)
end,
[Opcode.OpenCapture] = function(op, pc)
return string.format("%s (idx = %s)",
op.cap.value, op.key)
end,
[Opcode.Set] = function(op, pc)
return printcharset(op)
end,
[Opcode.TestSet] = function(op, pc)
return printcharset(op) .. printjmp(op, pc)
end,
[Opcode.Span] = function(op, pc)
return printcharset(op)
end,
[Opcode.OpenCall] = function(op, pc)
return string.format("-> %d", op.target) -- rule number
end,
[Opcode.Behind] = function(op, pc)
return string.format("%d", op.aux)
end,
[{Opcode.Jmp, Opcode.Call, Opcode.Commit, Opcode.Choice,
Opcode.PartialCommit, Opcode.BackCommit, Opcode.TestAny,
Opcode.PredChoice}] = function(op, pc)
return printjmp(op, pc)
end,
[Opcode.Throw] = function(op, pc) -- labeled failure
return string.format("(idx = %s)", op.key)
end,
[Opcode.ThrowRec] = function(op, pc)
return printjmp(op, pc) .. string.format("(idx = %s)", op.key)
end,
default = function() return '' end,
}
function printinst(pc, op, accum)
table.insert(accum, string.format("%02d: %s ", pc, op.code.value))
table.insert(accum, printinst_helper(op, pc))
table.insert(accum, "\n")
return accum
end
function printpatt(code, accum)
for pc,op in ipairs(code) do
printinst(pc, op, accum)
end
return accum
end
function printcap(cap, indent)
print(string.format("%s%s", string.rep(' ', indent), cap))
end
function printcap2close(captures, ncaptures, i, indent)
local head = captures[i]
i = i + 1
printcap(head, indent) -- print head capture
while i <= ncaptures and head:inside(captures[i]) do
i = printcap2close(captures, ncaptures, i, indent + 2) -- print nested captures
end
if i <= ncaptures and head:isopencap() then
assert(captures[i]:isclosecap())
printcap(captures[i], indent) -- print and skip close capture
i = i + 1
end
return i
end
function printcaplist(captures, ncaptures)
-- for debugging, first print a raw list of captures
if ncaptures == nil then ncaptures = #captures end
for i=1,ncaptures do
printcap(captures[i], 0)
end
print(">======");
local i=1
while i <= ncaptures and not captures[i]:isclosecap() do
i = printcap2close(captures, ncaptures, i, 0)
end
if i > ncaptures then
print("<unmatched>")
end
print("=======");
end
local printtree_helper = define_tree_visitor{
[TTag.Char] = function(tree)
local c = compat.utf8char(tree.n)
if c:find("%C") ~= nil then -- printable?
return " '" .. c .. "'"
else
return string.format(" (%02X)", tree.n)
end
end,
[TTag.Set] = function(tree)
return printcharset(tree)
end,
[TTag.UTFR] = function(tree)
return " " .. tree.from .. " - " .. tree.to
end,
[{TTag.OpenCall, TTag.Call}] = function(tree)
local ret = string.format(" key: %s", tree.key)
local rule = tree.sib2
if rule ~= nil then
ret = ret .. " (rule: " .. rule.n .. ")"
end
return ret
end,
[TTag.Behind] = function(tree)
return " " .. tree.n
end,
[TTag.Capture] = function(tree)
return string.format(" kind: '%s' key: %s", tree.cap.value, tree.key)
end,
[TTag.Rule] = function(tree)
return string.format(" key: %s", tree.key)
end,
[TTag.XInfo] = function(tree)
return " n: " .. tree.n
end,
[TTag.Grammar] = function(tree)
return " " .. tree.n -- number of rules
end,
[TTag.Throw] = function(tree)
return string.format(" key: %s", tree.key)
end,
default = function(tree) return '' end
}
function printtree(tree, indent, accum)
local sibs = numsiblings[tree.tag]
table.insert(accum, string.rep(' ', indent))
table.insert(accum, tree.tag.value)
table.insert(accum, printtree_helper(tree))
table.insert(accum, "\n")
if tree.tag == TTag.Rule then
sibs = 1 -- don't print sib2
elseif tree.tag == TTag.Grammar then
local rule = tree.sib1
for i=1,tree.n do
printtree(rule, indent + 2, accum)
rule = rule.sib2
end
sibs = 0 -- siblings already handled
end
if sibs >= 1 then
printtree(tree.sib1, indent + 2, accum)
if sibs >= 2 then
printtree(tree.sib2, indent + 2, accum)
end
end
return accum
end
local PREFIX = "" -- could also be "l." or "lpeg." etc
local printrepl_helper
printrepl_helper = define_tree_visitor{
[TTag.True] = function(tree, buf)
table.insert(buf, PREFIX .. 'P""')
end,
[TTag.Any] = function(tree, buf)
table.insert(buf, PREFIX .. 'P(1)')
end,
[TTag.Char] = function(tree, buf)
table.insert(buf, PREFIX .. 'P"')
local c = compat.utf8char(tree.n)
if c:find("%C") ~= nil then -- printable?
table.insert(buf, c)
else
table.insert(buf, string.format('\\%02X', tree.n))
end
table.insert(buf, '"')
end,
[TTag.Set] = function(tree, buf)
local nbuf = {}
local insertchar = function(cp)
local c = compat.utf8char(cp)
if string.find(c, "^[^%w%p ]") ~= nil then
table.insert(nbuf, string.format('\\x%02X', cp))
else
table.insert(nbuf, c)
end
end
local nargs = 0
local inserttwo = function(cp1, cp2)
if nargs > 0 then table.insert(nbuf, ',') end
nargs = nargs + 1
table.insert(nbuf, '"')
insertchar(cp1)
insertchar(cp2)
table.insert(nbuf, '"')
end
local i = 0
while i <= CHARMAX do
local first = i
while tree.set[i] and i <= CHARMAX do
i = i + 1
end
if first == (i - 1) then -- unary range
inserttwo(first, first)
elseif first < (i-1) then -- non-empty range
inserttwo(first, i-1)
end
i = i + 1
end
local r = table.concat(nbuf)
if nargs == 1 then
r = PREFIX .. 'S' .. r
else
r = PREFIX .. 'S(' .. r .. ')'
end
if tree.rest then
table.insert(buf, '(')
table.insert(buf, r)
table.insert(buf, ' + ')
table.insert(buf, PREFIX)
table.insert(buf, 'utfR(0x80, 0x10FFFF))')
else
table.insert(buf, r)
end
end,
[TTag.UTFR] = function(tree, buf)
table.insert(buf, string.format("%sutfR(0x%04X, 0x%04X)", PREFIX, tree.from, tree.to))
end,
[{TTag.OpenCall, TTag.Call}] = function(tree, buf)
table.insert(buf, string.format('%sV"%s"', PREFIX, tree.key))
end,
[TTag.Not] = function(tree, buf)
table.insert(buf, '-(')
printrepl_helper(tree.sib1, buf)
table.insert(buf, ')')
end,
[TTag.Seq] = function(tree, buf)
table.insert(buf, "(")
printrepl_helper(tree.sib1, buf)
table.insert(buf, " * ")
printrepl_helper(tree.sib2, buf)
table.insert(buf, ")")
end,
[TTag.Choice] = function(tree, buf)
table.insert(buf, "(")
printrepl_helper(tree.sib1, buf)
table.insert(buf, " + ")
printrepl_helper(tree.sib2, buf)
table.insert(buf, ")")
end,
[TTag.Rep] = function(tree, buf)
printrepl_helper(tree.sib1, buf)
table.insert(buf, "^0")
end,
--[[
[TTag.Behind] = function(tree)
return " " .. tree.n
end,
]]--
[TTag.Capture] = function(tree, buf)
local repl = define_type_visitor{
string = function(v)
return '"' .. v .. '"' -- xxx should handle escapes
end,
default = tostring,
}
local name = nil
local show_patt = false
local show_key = false
if tree.cap == CapKind.simple then
name = 'C'
show_patt = true
elseif tree.cap == CapKind.subst then
name = 'Cs'
show_patt = true
elseif tree.cap == CapKind.table then
name = 'Ct'
show_patt = true
elseif tree.cap == CapKind.pos then
name = 'Cp'
elseif tree.cap == CapKind.arg then
name = 'Carg'
show_key = true
elseif tree.cap == CapKind.backref then
name = 'Cb'
show_key = true
elseif tree.cap == CapKind.group then
name = 'Cg'
show_patt = true
show_key = (tree.key ~= nil)
end
if name ~= nil then
table.insert(buf, PREFIX)
table.insert(buf, name)
table.insert(buf, '(')
if show_patt then
printrepl_helper(tree.sib1, buf)
if show_key then
table.insert(buf, ', ')
end
end
if show_key then
table.insert(buf, repl(tree.key))
end
table.insert(buf, ')')
return
end
if tree.cap == CapKind.string or
tree.cap == CapKind.num or
tree.cap == CapKind.query or
tree.cap == CapKind['function'] then
printrepl_helper(tree.sib1, buf)
table.insert(buf, ' / ')
table.insert(buf, repl(tree.key))
return
end
-- fallback
table.insert(buf, string.format("<pattern %s>", tostring(tree.tag)))
end,
[TTag.Rule] = function(tree, buf)
local key = tree.name
if type(key) == 'number' then key = string.format("[%d]", key) end
table.insert(buf, key)
table.insert(buf, " = ")
printrepl_helper(tree.sib1, buf)
end,
[TTag.Grammar] = function(tree, buf)
table.insert(buf, PREFIX .. "P{")
local rule = tree.sib1
local r = {}
local first_rule_name = rule.name
r[first_rule_name] = rule
rule = rule.sib2
local names = {}
for i=2,tree.n do
table.insert(names, rule.name)
r[rule.name] = rule
rule = rule.sib2
end
-- sort rule names
table.sort(names)
table.insert(names, 1, first_rule_name)
-- now print in order
for _,name in ipairs(names) do
printrepl_helper(r[name], buf)
table.insert(buf, ", ")
end
table.insert(buf, "}")
end,
--[[
[TTag.Throw] = function(tree)
return " key: " .. tree.key .. " (rule: " .. tree.sib2.cap .. ")"
end,
]]--
default = function(tree, buf)
table.insert(buf, string.format("<pattern %s>", tostring(tree.tag)))
end,
}
function printrepl(tree)
local buf = {}
printrepl_helper(tree, buf)
return table.concat(buf)
end
return {
printcaplist = printcaplist,
printcharset = printcharset,
printinst = printinst,
printpatt = printpatt,
printrepl = printrepl,
printtree = printtree,
}
end)
register('llpeg.code', function(myrequire)
myrequire('strict')
local compat = myrequire('advent.compat')
local from = myrequire('llpeg.types').from
local
CHARMAX,
CapKind,
Opcode,
PE,
TTag,
define,
define_tree_visitor,
fullset,
newcharset,
numsiblings,
register_fname,
_ = from(myrequire('llpeg.types'), {
'CHARMAX',
'CapKind',
'Opcode',
'PE',
'TTag',
'define',
'define_tree_visitor',
'fullset',
'newcharset',
'numsiblings',
'register_fname',
})
local printinst = myrequire('llpeg.print').printinst
local TRACE_INSTRUCTIONS = false
-- signals a "no-instruction"
local NOINST = nil
-- don't optimize captures longer than this
local MAXOFF = 15
-- forward declarations
local codegen
local CompileState = {}
CompileState.__index = CompileState
--[[
** {======================================================
** Analysis and some optimizations
** =======================================================
]]--
--[[
** Check whether a charset is empty (returns IFail), singleton (IChar),
** full (IAny), or none of those (ISet). When singleton, '*c' returns
** which character it is. (When generic set, the set was the input,
** so there is no need to return it.)
]]--
function charsettype(cs)
local count = 0
local candidate
for i,_ in pairs(cs.set) do
candidate = i
count = count + 1
end
if cs.rest then
if count == (CHARMAX + 1) then
return Opcode.Any -- full set
end
elseif count == 0 then
return Opcode.Fail -- empty set
elseif count == 1 then
return Opcode.Char, candidate -- single char
end
return Opcode.Set -- neither full nor empty nor singleton
end
-- A few basic operations on charsets; returns new object
function cs_clone(cs)
local result = newcharset()
for i,_ in pairs(cs.set) do
result.set[i] = true
end
result.rest = cs.rest
return result
end
function cs_complement(cs)
local result = newcharset()
for i=0,CHARMAX do
if not cs.set[i] then
result.set[i] = true
end
end
result.rest = not cs.rest
return result
end
function cs_intersection(a, b)
local result = newcharset()
for i,_ in pairs(a.set) do
if a.set[i] and b.set[i] then
result.set[i] = true
end
end
result.rest = a.rest and b.rest
return result
end
function cs_union(a, b)
local result = newcharset()
for i=0,CHARMAX do
if a.set[i] or b.set[i] then
result.set[i] = true
end
end
result.rest = a.rest or b.rest
return result
end
function cs_diff(a, b)
local result = newcharset()
for i=0,CHARMAX do
if a.set[i] and not b.set[i] then
result.set[i] = true
end
end
result.rest = a.rest and not b.rest
return result
end
function cs_disjoint(a, b)
if a.rest == b.rest then return false end
for i,_ in pairs(a.set) do
if b.set[i] then return false end
end
for i,_ in pairs(b.set) do
if a.set[i] then return false end
end
return true
end
function cs_equal(a, b)
if a.rest ~= b.rest then return false end
for i,_ in pairs(a.set) do
if not b.set[i] then return false end
end
for i,_ in pairs(b.set) do
if not a.set[i] then return false end
end
return true
end
--[[
** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a
** charset and return it; else return nil.
]]--
local tocharset = define_tree_visitor{
[TTag.Set] = function(v)
return v -- copy set
end,
[TTag.Char] = function(v)
-- only one char
if v.n <= CHARMAX then
local t = newcharset()
t.set[v.n] = true
return t
else
return nil
end
end,
[TTag.Any] = function(v)
return fullset
end,
[TTag.False] = function(v)
return newcharset()
end,
default = function(v)
return nil
end,
}
register_fname("tocharset", tocharset)
--[[
** Visit a TCall node taking care to stop recursion. If node not yet
** visited, return 'f(rule for call)', otherwise return 'def' (default
** value)
]]--
function CompileState:callrecursive(tree, f, default_value, ...)
if tree.tag ~= TTag.Call then
error("unexpected tree tag")
end
local rule = self.grammar.ruletab[tree.key]
if rule.tag ~= TTag.Rule then
error("unexpected tree sibling")
end
if tree.seen == true then
return default_value -- node already visited
else
-- first visit
local oldseen = tree.seen
tree.seen = true
local result = f(rule, ...)
tree.seen = oldseen -- restore tree
return result
end
end
--[[
** Check whether a pattern tree has captures
]]--
local hascaptures
hascaptures = define_tree_visitor{
[{TTag.Capture, TTag.RunTime}] = function(tree, cs)
return true
end,
[TTag.Call] = function(tree, cs)
assert(cs ~= nil)
return cs:callrecursive(tree, hascaptures, false, cs)
end,
[TTag.Rule] = function(tree, cs)
-- do not follow siblings
return hascaptures(tree.sib1, cs)
end,
[TTag.OpenCall] = function(tree, cs)
error("should not happen")
end,
[TTag.Grammar] = function(tree, cs)
-- make a fake compile state to hold the grammar, if necessary
if cs == nil then cs = CompileState:new(nil) end
return cs:withGrammar(tree, hascaptures, tree.sib1, cs)
end,
default = function(tree, cs)
local n = numsiblings[tree.tag]
if n == 1 then
return hascaptures(tree.sib1, cs) -- tail call
elseif n == 2 then
if hascaptures(tree.sib1, cs) then return true end
return hascaptures(tree.sib2, cs) -- tail call
elseif n == 0 then
return false
else
error("how many siblings does this have?")
end
end,
}
function CompileState:hascaptures(t) return hascaptures(t, self) end
register_fname("hascaptures", hascaptures)
--[[
** Checks how a pattern behaves regarding the empty string,
** in one of two different ways:
** A pattern is *nullable* if it can match without consuming any character;
** A pattern is *nofail* if it never fails for any string
** (including the empty string).
** The difference is only for predicates and run-time captures;
** for other patterns, the two properties are equivalent.
** (With predicates, &'a' is nullable but not nofail. Of course,
** nofail => nullable.)
** These functions are all convervative in the following way:
** p is nullable => nullable(p)
** nofail(p) => p cannot fail
** The function assumes that TOpenCall is not nullable;
** this will be checked again when the grammar is fixed.
** Run-time captures can do whatever they want, so the result
** is conservative.
]]--
local checkaux
checkaux = define_tree_visitor{
[{
TTag.Char, TTag.Set, TTag.Any, TTag.UTFR, TTag.False,
TTag.OpenCall, TTag.Throw,
}] = function(tree, pred, cs)
return false -- not nullable
end,
[{TTag.Rep,TTag.True}] = function(tree, pred, cs)
return true -- no fail
end,
[{TTag.Not,TTag.Behind}] = function(tree, pred, cs)
-- can match empty, but can fail
if pred == PE.nofail then
return false
else
return true
end
end,
[TTag.And] = function(tree, pred, cs)
-- can match empty; fail iff body does
if pred == PE.nullable then
return true
end
return checkaux(tree.sib1, pred, cs) -- tail call
end,
[TTag.RunTime] = function(tree, pred, cs)
-- can fail; match empty iff body does
if pred == PE.nofail then
return false
end
return checkaux(tree.sib1, pred, cs) -- tail call
end,
[TTag.Seq] = function(tree, pred, cs)
if not checkaux(tree.sib1, pred, cs) then
return false
end
return checkaux(tree.sib2, pred, cs) -- tail call
end,
[TTag.Choice] = function(tree, pred, cs)
if checkaux(tree.sib2, pred, cs) then
return true
end
return checkaux(tree.sib1, pred, cs) -- tail call
end,
[{ TTag.Capture, TTag.Rule, TTag.XInfo, }] = function(tree, pred, cs)
return checkaux(tree.sib1, pred, cs)
end,
[TTag.Grammar] = function(tree, pred, cs)
-- make a fake compile state to hold the grammar, if necessary
if cs == nil then cs = CompileState:new(nil) end
return cs:withGrammar(tree, checkaux, tree.sib1, pred, cs)
end,
[TTag.Call] = function(tree, pred, cs)
-- open calls are assumed not nullable; checked again after grammar
-- is fixed
if cs == nil then return false end
return checkaux(cs.grammar.ruletab[tree.key], pred, cs)
end,
}
register_fname("checkaux", checkaux)
function nofail(t, cs) return checkaux(t, PE.nofail, cs) end
function CompileState:nofail(t) return nofail(t, self) end
function nullable(t, cs) return checkaux(t, PE.nullable, cs) end
function CompileState:nullable(t) return nullable(t, self) end
function nullable_with_grammar(t, grm)
local cs = CompileState:new(nil)
return cs:withGrammar(grm, nullable, t, cs)
end
-- Note that we are counting characters, not bytes
local fixedlen, fixedlen_helper
fixedlen_helper = define_tree_visitor{
[{TTag.Char, TTag.Set, TTag.Any, TTag.UTFR}] = function(tree, len)
return len + 1
end,
[{TTag.False, TTag.True, TTag.Not, TTag.And, TTag.Behind}] = function(tree, len)
return len
end,
[{TTag.Rep, TTag.RunTime, TTag.OpenCall, TTag.Throw,}] = function(tree, len)
return -1 -- variable
end,
[{TTag.Capture, TTag.Rule, TTag.XInfo,}] = function(tree, len, cs)
return fixedlen_helper(tree.sib1, len, cs)
end,
[TTag.Grammar] = function(tree, len, cs)
-- make a fake compile state to hold the grammar, if necessary
if cs == nil then cs = CompileState:new(nil) end
return cs:withGrammar(tree, fixedlen_helper, tree.sib1, len, cs)
end,
[TTag.Call] = function(tree, len, cs)
-- If evaluating outside the context of a grammar, conservatively
-- return "variable"
if cs == nil then return -1 end
-- otherwise, carefully recurse
local n1 = cs:callrecursive(tree, fixedlen, -1, cs)
if n1 < 0 then return -1 end -- variable
return len + n1
end,
[TTag.Seq] = function(tree, len, cs)
local n1 = fixedlen_helper(tree.sib1, len, cs)
if n1 < 0 then return -1 end -- variable
return fixedlen_helper(tree.sib2, n1, cs)
end,
[TTag.Choice] = function(tree, len, cs)
local n1 = fixedlen_helper(tree.sib1, len, cs)
local n2 = fixedlen_helper(tree.sib2, len, cs)
if n1 ~= n2 or n1 < 0 then
return -1
else
return n1
end
end,
}
function fixedlen(tree, cs)
return fixedlen_helper(tree, 0, cs) -- supply default 0 for 'len'
end
function CompileState:fixedlen(t) return fixedlen(t, self) end
register_fname("fixedlen_helper", fixedlen_helper)
--[[
** Computes the 'first set' of a pattern.
** The result is a conservative aproximation:
** match p ax -> x (for some x) ==> a belongs to first(p)
** or
** a not in first(p) ==> match p ax -> fail (for all x)
**
** The set 'follow' is the first set of what follows the
** pattern (full set if nothing follows it).
**
** The function returns 0 when this resulting set can be used for
** test instructions that avoid the pattern altogether.
** A non-zero return can happen for two reasons:
** 1) match p '' -> '' ==> return has bit 1 set
** (tests cannot be used because they would always fail for an empty input);
** 2) there is a match-time capture ==> return has bit 2 set
** (optimizations should not bypass match-time captures).
]]--
local getfirst
getfirst = define_tree_visitor{
[TTag.Char] = function(t, follow, cs)
if t.n <= CHARMAX then return 0, tocharset(t) end
-- conservative approximation!
local s = newcharset()
s.rest = true
return 0, s
end,
[{ TTag.Set, TTag.Any, TTag.False }] = function(t, follow, cs)
return 0, tocharset(t)
end,
[TTag.UTFR] = function(t, follow, cs)
-- conservative approximation!
local firstset = newcharset()
if t.from <= CHARMAX then
for i=t.from, math.min(CHARMAX, t.to) do
firstset.set[i] = true
end
end
if t.to > CHARMAX then
-- conservative approximation of non-ascii unicode range
firstset.rest = true
end
return 0, firstset
end,
[TTag.True] = function(t, follow, cs)
return 1, follow -- 1 because this accepts the empty string
end,
[TTag.Throw] = function(t, follow, cs)
-- labeled failure: must always throw the label
return 1, fullset
end,
[TTag.Choice] = function(t, follow, cs)
local firstset = newcharset()
local e1,e1set = getfirst(t.sib1, follow, cs)
local e2,e2set = getfirst(t.sib2, follow, cs)
local firstset = cs_union(e1set, e2set)
local ret = 0 -- awkward lua5.1 way to say "e1 | e2"
if (e1 % 2) == 1 or (e2 % 2) == 1 then
ret = ret + 1
end
e1,e2 = compat.rshift(e1, 1), compat.rshift(e2, 1)
if (e1 % 2) == 1 or (e2 % 2) == 1 then
ret = ret + 2
end
return ret, firstset
end,
[TTag.Seq] = function(t, follow, cs)
if not nullable(t.sib1, cs) then
-- when p1 is not nullable, p2 has nothing to contribute
return getfirst(t.sib1, fullset, cs) -- tail call
else -- FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl))
local e2,csaux = getfirst(t.sib2, follow, cs)
local e1,firstset = getfirst(t.sib1, csaux, cs)
if e1 == 0 then
return 0, firstset -- 'e1' ensures that first can be used
elseif compat.rshift(e1, 1) % 2 == 1 or compat.rshift(e2, 1) % 2 == 1 then
-- one of the children has a matchtime?
return 2, firstset -- pattern has a matchtime capture
else
return e2, firstset -- else depends on e2
end
end
end,
[TTag.Rep] = function(t, follow, cs)
local _,firstset = getfirst(t.sib1, follow, cs)
return 1, cs_union(firstset, follow, cs) -- accepts the empty string
end,
[{ TTag.Capture,TTag.Rule,TTag.XInfo }] = function(t, follow, cs)
return getfirst(t.sib1, follow, cs) -- tail call
end,
[TTag.Grammar] = function(t, follow, cs)
return cs:withGrammar(t, getfirst, t.sib1, follow, cs)
end,
[TTag.RunTime] = function(t, follow, cs)
-- function invalidates any follow info
local e,firstset = getfirst(t.sib1, fullset, cs)
if e ~= 0 then
-- function is not "protected"?
return 2,firstset
else
-- pattern inside capture ensures first can be used
return 0,firstset
end
end,
[TTag.Call] = function(t, follow, cs)
local rule = cs.grammar.ruletab[t.key]
return getfirst(rule, follow, cs) -- tail call
end,
[TTag.And] = function(t, follow, cs)
local e,firstset = getfirst(t.sib1, follow, cs)
return e, cs_intersection(firstset, follow, cs)
end,
[{ TTag.Not, TTag.Behind }] = function(t, follow, cs)
if t.tag == TTag.Not then
local firstset = tocharset(t.sib1)
if firstset ~= nil then
return 1,cs_complement(firstset) -- could match empty input
end
end
-- the TNot or TBehind gives no new information
-- call getfirst only to check for math-time captures
local e,_ = getfirst(t.sib1, follow, cs)
if e%2 == 0 then e = e + 1 end -- set the lsb; could match empty input
return e, follow -- uses follow
end,
}
function CompileState:getfirst(t, follow) return getfirst(t, follow, self) end
register_fname("getfirst", getfirst)
--[[
** If 'headfail(tree)' true, then 'tree' can fail only depending on the
** next character of the subject.
-- ie, a single character of lookahead is enough to evaluate the pattern
-- rooted at this node
]]--
local headfail
headfail = define_tree_visitor{
[{TTag.Char, TTag.Set, TTag.Any,
TTag.False}] = function(t, cs)
return true
end,
[{TTag.True, TTag.Rep, TTag.RunTime, TTag.Not,
-- even though we are codepoint-based, we don't have a TestUTFR instruction
-- so we can't use a Test instruction on the first character, which is
-- implicitly what headfail is checking for.
TTag.UTFR,
TTag.Behind, TTag.Throw}] = function(t, cs)
return false
end,
[{TTag.Capture, TTag.Rule,
TTag.XInfo, TTag.And}] = function(t, cs)
return headfail(t.sib1, cs) -- tail call
end,
[TTag.Grammar] = function(t, cs)
return cs:withGrammar(t, headfail, t.sib1, cs)
end,
[TTag.Call] = function(t, cs)
local rule = cs.grammar.ruletab[t.key]
return headfail(rule, cs) -- tail call
end,
[TTag.Seq] = function(t, cs)
if not nofail(t.sib2, cs) then
-- if the second child could possibly fail, then we can't
-- evaluate the entire seq based just on the first child
return false
end
return headfail(t.sib1, cs) -- tail call
end,
[TTag.Choice] = function(t, cs)
-- both children need to be headfail for this to be headfail
if not headfail(t.sib1, cs) then
return false
end
return headfail(t.sib2, cs) -- tail call
end,
}
function CompileState:headfail(t) return headfail(t, self) end
register_fname("headfail", headfail)
--[[
** Check whether the code generation for the given tree can benefit
** from a follow set (to avoid computing the follow set when it is
** not needed)
]]--
local needfollow
needfollow = define_tree_visitor{
[{TTag.Char, TTag.Set, TTag.Any, TTag.UTFR,
TTag.False, TTag.True, TTag.And, TTag.Not,
TTag.RunTime, TTag.Grammar, TTag.Call, TTag.Behind,
TTag.Throw, }] = function(tree) return false end,
[{TTag.Choice, TTag.Rep}] = function(tree) return true end,
[TTag.Capture] = function(tree) return needfollow(tree.sib1) end,
[TTag.Seq] = function(tree) return needfollow(tree.sib2) end,
}
register_fname("needfollow", needfollow)
--[[
** ======================================================
** Code generation
** ======================================================
]]--
local Instruction = {}
Instruction.__index = Instruction
function Instruction:new(arg)
local opcode = arg[1]
if opcode == nil then error("no opcode") end
-- target is rule # for open calls before correction, and absolute pc after
local instr = {
code = opcode,
exec = opcode.exec, -- copy the exec function from the opcode!
aux = arg.aux, -- used for the "primary argument"
key = arg.key, -- used for string-valued arguments
target = arg.target, -- used for jmp-like instructions
from = arg.from, -- inclusive start, for ranges
to = arg.to, -- inclusive end, for ranges
set = arg.set, -- charset <= CHARMAX
rest = arg.rest, -- include characters above CHARMAX?
cap = arg.cap, -- used for "capture kind"
}
setmetatable(instr, self)
instr:setCode(opcode) -- opportunity to add tracing logic!
return instr
end
function Instruction:setCode(opcode)
self.code = opcode
local exec = opcode.exec
if TRACE_INSTRUCTIONS then
local str
self.exec = function(self, state, ...)
if str == nil then
str = table.concat(printinst(0, self, { "Executing " })):gsub("\n","")
end
print(state.bytePos, state.codepoint, str)
return exec(self, state, ...)
end
else
self.exec = exec
end
end
-- state for the compiler
function CompileState:new(p)
local cs = {
p = p,
}
setmetatable(cs, self)
return cs
end
function CompileState:withGrammar(g, f, ...)
local lastGrammar = self.grammar
self.grammar = g
local result = compat.pack(f(...))
self.grammar = lastGrammar
return compat.unpack(result)
end
function CompileState:codegen(tree, opt, tt, fl)
assert(fl.tag == TTag.Set)
-- just a little helper
return codegen(tree, self, opt, tt, fl)
end
function CompileState:getinstr(i)
return self.p.code[i]
end
function CompileState:addinstruction(arg)
local code = self.p.code
table.insert(code, Instruction:new(arg))
return #code
end
function CompileState:gethere()
local code = self.p.code
return 1 + #code
end
function CompileState:jumptothere(pc, where)
if pc ~= NOINST then
local code = self.p.code
code[pc].target = where
end
end
function CompileState:jumptohere(pc)
self:jumptothere(pc, self:gethere())
end
function codethrow(cs, throw)
local rule = nil
if cs.grammar ~= nil then
-- we only lookup/match *string* rule names, not numeric indices
rule = cs.grammar.ruletab[tostring(throw.key)]
end
if rule ~= nil then
return cs:addinstruction{
Opcode.ThrowRec,
key=throw.key, -- rule name / error label
target=rule.n -- recovery rule number
}
else
return cs:addinstruction{
Opcode.Throw,
key=throw.key, -- rule name / error label
-- no recovery rule
}
end
end
function codeutfr(cs, tree)
return cs:addinstruction{
Opcode.UTFR,
from = tree.from,
to = tree.to,
}
end
--[[
** Code an IChar instruction, or IAny if there is an equivalent
** test dominating it
]]--
function codechar(cs, codepoint, tt)
if tt ~= NOINST and
cs:getinstr(tt).code == Opcode.TestChar and
cs:getinstr(tt).aux == codepoint then
cs:addinstruction{Opcode.Any}
else
cs:addinstruction{Opcode.Char, aux=codepoint,}
end
end
--[[
** Add a charset posfix to an instruction
]]--
function addcharset(cs, codepoint)
--[[
static void addcharset (CompileState *compst, const byte *cs) {
int p = gethere(compst);
int i;
for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++)
nextinstruction(compst); /* space for buffer */
/* fill buffer with charset */
loopset(j, getinstr(compst, p).buff[j] = cs[j]);
]]--
end
--[[
** code a char set, optimizing unit sets for IChar, "complete"
** sets for IAny, and empty sets for IFail; also use an IAny
** when instruction is dominated by an equivalent test.
]]--
function codecharset(cs, tree, tt)
local op,codepoint = charsettype(tree)
if op == Opcode.Char then
return codechar(cs, codepoint, tt)
elseif op == Opcode.Set then
-- non-trivial set?
if tt ~= NOINST and
cs:getinstr(tt).code == Opcode.TestSet and
cs_equal(tree, cs:getinstr(tt)) then
return cs:addinstruction{Opcode.Any}
else
return cs:addinstruction{
Opcode.Set,
set = tree.set, -- XXX ensure immutable
rest = tree.rest,
}
end
else
return cs:addinstruction{op} -- Any or Fail
end
end
--[[
** code a test set, optimizing unit sets for ITestChar, "complete"
** sets for ITestAny, and empty sets for IJmp (always fails).
** 'e' is nonzero iff test should accept the empty string. (Test
** instructions in the current VM never accept the empty string.)
]]--
function codetestset(cs, tree, e)
if e ~= 0 then return NOINST end
local op,codepoint = charsettype(tree)
if op == Opcode.Fail then
return cs:addinstruction{Opcode.Jmp, target = NOINST} -- always jump
elseif op == Opcode.Any then
return cs:addinstruction{Opcode.TestAny, target = NOINST}
elseif op == Opcode.Char then
return cs:addinstruction{
Opcode.TestChar,
target = NOINST,
aux = codepoint,
}
elseif op == Opcode.Set then
return cs:addinstruction{
Opcode.TestSet,
target = NOINST,
set = tree.set, -- XXX ensure immutable
rest = tree.rest,
}
else
error("unreachable")
end
end
--[[
** <behind(p)> == behind n; <p> (where n = fixedlen(p))
]]--
function codebehind(cs, tree)
if tree.n > 0 then
cs:addinstruction{ Opcode.Behind, aux = tree.n }
end
return cs:codegen(tree.sib1, false, NOINST, fullset)
end
--[[
** Choice; optimizations:
** - when p1 is headfail or when first(p1) and first(p2) are disjoint,
** than a character not in first(p1) cannot go to p1 and a character
** in first(p1) cannot go to p2, either because p1 will accept
** (headfail) or because it is not in first(p2) (disjoint).
** (The second case is not valid if p1 accepts the empty string,
** as then there is no character at all...)
** - when p2 is empty and opt is true; a IPartialCommit can reuse
** the Choice already active in the stack.
]]--
function codechoice(cs, p1, p2, opt, fl)
local emptyp2 = (p2.tag == TTag.True)
local e1, cs1 = cs:getfirst(p1, fullset)
local headfailp1 = cs:headfail(p1)
local e2, cs2
if not headfailp1 and e1 == 0 then
e2, cs2 = cs:getfirst(p2, fl) -- avoid computing unless necessary
end
if headfailp1 or (e1 == 0 and cs_disjoint(cs1, cs2)) then
-- <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2:
local test = codetestset(cs, cs1, 0)
local jmp = NOINST
cs:codegen(p1, false, test, fl)
if not emptyp2 then
jmp = cs:addinstruction{Opcode.Jmp, target = NOINST }
end
cs:jumptohere(test)
cs:codegen(p2, opt, NOINST, fl)
cs:jumptohere(jmp)
elseif opt and emptyp2 then
-- p1? == IPartialCommit; p1
cs:jumptohere(cs:addinstruction{Opcode.PartialCommit, target = NOINST})
cs:codegen(p1, true, NOINST, fullset)
else
-- <p1 / p2> ==
-- test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2:
local test = codetestset(cs, cs1, e1)
local pchoice = cs:addinstruction{Opcode.Choice, target = NOINST}
cs:codegen(p1, emptyp2, test, fullset)
local pcommit = cs:addinstruction{Opcode.Commit, target = NOINST}
cs:jumptohere(pchoice)
cs:jumptohere(test)
cs:codegen(p2, opt, NOINST, fl)
cs:jumptohere(pcommit)
end
end
--[[
** And predicate
** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n
** (valid only when 'p' has no captures)
]]--
function codeand(cs, tree, tt)
--[[ labeled failure: optimization disabled because in case of a failure it
does not report the expected error position (the current subject position
when begin the matching of <&p>) ]]--
local pchoice = cs:addinstruction{Opcode.PredChoice, target = NOINST}
cs:codegen(tree, false, tt, fullset)
local pcommit = cs:addinstruction{Opcode.BackCommit, target = NOINST}
cs:jumptohere(pchoice)
cs:addinstruction{Opcode.Fail}
cs:jumptohere(pcommit)
end
--[[
** Captures: if pattern has fixed (and not too big) length, and it
** has no nested captures, use a single IFullCapture instruction
** after the match; otherwise, enclose the pattern with OpenCapture -
** CloseCapture.
]]--
function codecapture(cs, tree, tt, fl)
local len = cs:fixedlen(tree.sib1)
if len >= 0 and len <= MAXOFF and not cs:hascaptures(tree.sib1) then
cs:codegen(tree.sib1, false, tt, fl)
cs:addinstruction{
Opcode.FullCapture,
cap = tree.cap,
key = tree.key, -- capture name
aux = len,
}
else
assert(tree.cap ~= nil)
cs:addinstruction({
Opcode.OpenCapture,
cap = tree.cap,
key = tree.key, -- capture name
})
cs:codegen(tree.sib1, false, tt, fl)
cs:addinstruction({
Opcode.CloseCapture,
cap = CapKind.close,
})
end
end
function coderuntime(cs, tree, tt)
cs:addinstruction({
Opcode.OpenCapture,
cap = CapKind.group,
key = tree.key, -- capture *function*
})
cs:codegen(tree.sib1, false, tt, fullset)
cs:addinstruction({
Opcode.CloseRunTime,
cap = CapKind.close,
})
end
--[[
** Repetition; optimizations:
** When pattern is a charset, can use special instruction ISpan.
** When pattern is head fail, or if it starts with characters that
** are disjoint from what follows the repetions, a simple test
** is enough (a fail inside the repetition would backtrack to fail
** again in the following pattern, so there is no need for a choice).
** When 'opt' is true, the repetion can reuse the Choice already
** active in the stack.
]]--
function coderep(cs, tree, opt, fl)
local st = tocharset(tree)
if st ~= nil then
return cs:addinstruction{
Opcode.Span,
set = st.set,
rest = st.rest,
}
end
local e1,st = cs:getfirst(tree, fullset)
if cs:headfail(tree) or (e1 == 0 and cs_disjoint(st, fl)) then
-- L1: test (fail(p1)) -> L2; <p>; jmp L1; L2:
local test = codetestset(cs, st, 0)
cs:codegen(tree, false, test, fullset)
local jmp = cs:addinstruction{Opcode.Jmp, target = NOINST}
cs:jumptohere(test)
cs:jumptothere(jmp, test)
else
-- test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2:
-- or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1;
local test = codetestset(cs, st, e1)
local pchoice = NOINST
if opt then
cs:jumptohere(cs:addinstruction{Opcode.PartialCommit, target = NOINST})
else
pchoice = cs:addinstruction{Opcode.Choice, target = NOINST}
end
local l2 = cs:gethere()
cs:codegen(tree, false, NOINST, fullset)
local commit = cs:addinstruction{Opcode.PartialCommit, target = NOINST}
cs:jumptothere(commit, l2)
cs:jumptohere(pchoice)
cs:jumptohere(test)
end
end
--[[
** Not predicate; optimizations:
** In any case, if first test fails, 'not' succeeds, so it can jump to
** the end. If pattern is headfail, that is all (it cannot fail
** in other parts); this case includes 'not' of simple sets. Otherwise,
** use the default code (a choice plus a failtwice).
]]--
function codenot(cs, tree)
local e,st = cs:getfirst(tree, fullset)
local test = codetestset(cs, st, e)
if cs:headfail(tree) then
-- test (fail(p1)) -> L1; fail; L1:
cs:addinstruction{Opcode.Fail}
else
-- test(fail(p))-> L1; choice L1; <p>; failtwice; L1:
local pchoice = cs:addinstruction{Opcode.PredChoice, target = NOINST }
cs:codegen(tree, false, NOINST, fullset)
cs:addinstruction{Opcode.FailTwice}
cs:jumptohere(pchoice)
end
cs:jumptohere(test)
end
-- find the final destination of a sequence of jumps
function finaltarget(code, pc)
while code[pc].code == Opcode.Jmp do
pc = code[pc].target
end
return pc
end
-- final label (after traversing any jumps)
function finallabel(code, pc)
return finaltarget(code, code[pc].target)
end
--[[
** change open calls to calls, using list 'positions' to find
** correct offsets; also optimize tail calls
]]--
function correctcalls(cs, positions, from, to)
local code = cs.p.code
for i=from,(to-1) do
local op = code[i]
if op.code == Opcode.OpenCall or op.code == Opcode.ThrowRec then
local n = op.target -- rule number
local rule = positions[n] -- rule position
if rule == from or code[rule - 1].code == Opcode.Ret then
-- sanity check! ok!
else
error("bad rule position")
end
if op.code == Opcode.OpenCall then
if code[finaltarget(code, i+1)].code == Opcode.Ret then
-- call; ret => tail call
op:setCode(Opcode.Jmp)
else
op:setCode(Opcode.Call) -- open call no more
end
end
op.target = rule
end
end
end
--[[
** Code for a grammar:
** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2:
]]--
function codegrammar(cs, tree)
local firstcall = cs:addinstruction{Opcode.Call, target = NOINST} -- call initial rule
local jumptoend = cs:addinstruction{Opcode.Jmp, target = NOINST} -- jump to the end
local start = cs:gethere() -- here starts the initial rule
cs:jumptohere(firstcall)
local positions = {}
local rule = tree.sib1
for i=1,tree.n do
local pattern = rule.sib1
positions[i] = cs:gethere() -- save rule position
cs:codegen(rule.sib1, false, NOINST, fullset) -- code rule
cs:addinstruction{Opcode.Ret}
rule = rule.sib2
end
if rule.tag ~= TTag.True then error("impossible") end
cs:jumptohere(jumptoend)
correctcalls(cs, positions, start, cs:gethere())
end
function codecall(cs, tree)
local rule = cs.grammar.ruletab[tree.key]
assert(rule ~= nil)
assert(rule.n ~= nil)
return cs:addinstruction{
Opcode.OpenCall, -- to be corrected later
target = rule.n -- rule number
}
end
--[[
** Code first child of a sequence
** (second child is called in-place to allow tail call)
** Return 'tt' for second child
]]--
function codeseq1(cs, p1, p2, tt, fl)
assert(fl.tag == TTag.Set)
if needfollow(p1) then
local _, fl1 = cs:getfirst(p2, fl) -- p1 follow is p2 first
cs:codegen(p1, false, tt, fl1)
else
-- use fullset as follow
cs:codegen(p1, false, tt, fullset)
end
if cs:fixedlen(p1) ~= 0 then -- can 'p1' consume anything?
return NOINST -- invalidate test
else
return tt -- else 'tt' still protects sib2
end
end
--[[
** Main code-generation function: dispatch to auxiliar functions
** according to kind of tree. ('needfollow' should return true
** only for consructions that use 'fl'.)
]]--
--[[
** code generation is recursive; 'opt' indicates that the code is being
** generated as the last thing inside an optional pattern (so, if that
** code is optional too, it can reuse the 'IChoice' already in place for
** the outer pattern). 'tt' points to a previous test protecting this
** code (or NOINST). 'fl' is the follow set of the pattern.
]]--
codegen = define_tree_visitor{
[TTag.Char] = function(tree, cs, opt, tt, fl)
return codechar(cs, tree.n, tt)
end,
[TTag.Any] = function(tree, cs, opt, tt, fl)
return cs:addinstruction{Opcode.Any}
end,
[TTag.Set] = function(tree, cs, opt, tt, fl)
return codecharset(cs, tree, tt)
end,
[TTag.True] = function(tree, cs, opt, tt, fl)
return -- do nothing
end,
[TTag.False] = function(tree, cs, opt, tt, fl)
return cs:addinstruction{Opcode.Fail}
end,
[TTag.UTFR] = function(tree, cs, opt, tt, fl)
return codeutfr(cs, tree)
end,
[TTag.Choice] = function(tree, cs, opt, tt, fl)
return codechoice(cs, tree.sib1, tree.sib2, opt, fl)
end,
[TTag.Rep] = function(tree, cs, opt, tt, fl)
return coderep(cs, tree.sib1, opt, fl)
end,
[TTag.Behind] = function(tree, cs, opt, tt, fl)
return codebehind(cs, tree)
end,
[TTag.Not] = function(tree, cs, opt, tt, fl)
return codenot(cs, tree.sib1)
end,
[TTag.And] = function(tree, cs, opt, tt, fl)
return codeand(cs, tree.sib1, tt)
end,
[TTag.Capture] = function(tree, cs, opt, tt, fl)
return codecapture(cs, tree, tt, fl)
end,
[TTag.RunTime] = function(tree, cs, opt, tt, fl)
return coderuntime(cs, tree, tt)
end,
[TTag.Grammar] = function(tree, cs, opt, tt, fl)
return cs:withGrammar(tree, codegrammar, cs, tree)
end,
[TTag.Call] = function(tree, cs, opt, tt, fl)
return codecall(cs, tree)
end,
[TTag.Seq] = function(tree, cs, opt, tt, fl)
tt = codeseq1(cs, tree.sib1, tree.sib2, tt, fl) -- code 'p1'
return cs:codegen(tree.sib2, opt, tt, fl) -- tail call
end,
[TTag.Throw] = function(tree, cs, opt, tt, fl)
return codethrow(cs, tree)
end,
["assert"] = function(tree, cs, opt, tt, fl)
assert(fl.tag == TTag.Set)
assert(opt ~= 0)
end,
}
register_fname("codegen", codegen)
--[[
** Optimize jumps and other jump-like instructions.
** * Update labels of instructions with labels to their final
** destinations (e.g., choice L1; ... L1: jmp L2: becomes
** choice L2)
** * Jumps to other instructions that do jumps become those
** instructions (e.g., jump to return becomes a return; jump
** to commit becomes a commit)
]]--
function peephole(cs)
local code = cs.p.code
local jmpswitch
local switch = define_opcode_visitor{
-- instructions with labels
[{Opcode.Choice, Opcode.Call, Opcode.Commit, Opcode.PartialCommit,
Opcode.BackCommit, Opcode.TestChar, Opcode.TestSet,
Opcode.TestAny}] = function(op, i)
cs:jumptothere(i, finallabel(code, i))
end,
[Opcode.Jmp] = function(op, i)
local ft = finaltarget(code, i)
jmpswitch(code[ft], i, ft) -- jumping to what?
end,
default = function() end,
}
jmpswitch = define_opcode_visitor{
-- instructions with unconditional implicit jumps
[{Opcode.Ret,Opcode.Fail,Opcode.FailTwice,Opcode.End}] = function(op, i, ft)
code[i]:setCode(code[ft].code) -- jump becomes that instruction
end,
-- instructions with unconditional explicit jumps
[{Opcode.Commit, Opcode.PartialCommit, Opcode.BackCommit}] = function(op, i, ft)
local fft = finallabel(code, ft)
code[i]:setCode(code[ft].code) -- jump becomes that instruction
cs:jumptothere(i, fft) -- with an optimized target
switch(code[i], i) -- reoptimize the label
end,
default = function(op, i, ft)
cs:jumptothere(i, ft) -- optimize label
end,
}
for i=1,#code do
switch(code[i], i)
end
end
-- thread the instructions to speed up dispatch during execution
function thread(cs)
local code = cs.p.code
for i=1,#code-1 do
code[i].next = code[i+1]
if code[i].target ~= nil then
code[i].branch = code[code[i].target]
end
end
end
function compile(p)
local compst = CompileState:new(p)
p.code = {}
assert(fullset.tag == TTag.Set)
compst:codegen(p, false, NOINST, fullset)
compst:addinstruction{Opcode.End}
peephole(compst)
thread(compst)
return p.code
end
return {
Instruction = Instruction,
compile = compile,
cs_clone = cs_clone,
cs_complement = cs_complement,
cs_diff = cs_diff,
cs_intersection = cs_intersection,
cs_union = cs_union,
fixedlen = fixedlen,
hascaptures = hascaptures,
nofail = nofail,
nullable = nullable,
nullable_with_grammar = nullable_with_grammar,
tocharset = tocharset,
}
end)
register('llpeg.utf8util', function(myrequire)
myrequire('strict')
local utf8util = {}
function utf8util.codepointAt(s, pos)
local c1 = string.byte(s, pos)
if c1 <= 0x7F then
return c1, 1
end
local c2 = string.byte(s, pos + 1)
if c1 <= 0xDF then
return ((c1 % 0x20 ) * 0x40) + (c2 % 0x40), 2
end
local c3 = string.byte(s, pos + 2)
if c1 <= 0xEF then
return (((c1 % 0x10) * 0x40) + (c2 % 0x40)) * 0x40 + (c3 % 0x40), 3
end
local c4 = string.byte(s, pos + 3)
if c1 <= 0xF7 then
return ((((c1 % 0x08) * 0x40) + (c2 % 0x40)) * 0x40 + (c3 % 0x40)) * 0x40 + (c4 % 0x40), 4
end
error( "bad utf8" )
end
-- same as utf8.offset in Lua 5.3 standard library
function utf8util.offset(s, n, i)
if n > 0 then error("unimplemented") end
while n < 0 do
i = i - 1
if i < 1 then return nil end
local c = string.byte(s, i)
if c < 0x80 or c > 0xBF then
n = n + 1
end
end
return i
end
return utf8util
end)
register('llpeg.list', function(myrequire)
local List = {}
List.__index = List
function List:new()
return setmetatable({ n = 0 }, self)
end
function List:__len()
return self.n
end
function List:push(val)
local n = self.n + 1
self[n] = val
self.n = n
end
function List:pop()
local n = self.n
assert(n > 0)
local old = self[n]
self[n] = nil
self.n = n - 1
return old
end
function List:insert(pos, val)
for i=self.n,pos,-1 do
self[i+1] = self[i]
end
self[pos] = val
self.n = self.n + 1
end
return List
end)
register('llpeg.cap', function(myrequire)
myrequire('strict')
local compat = myrequire('advent.compat')
local from = myrequire('llpeg.types').from
local
CapKind,
_ = from(myrequire('llpeg.types'), {
'CapKind',
})
local
printcaplist,
_ = from(myrequire('llpeg.print'), {
'printcaplist',
})
local List = myrequire('llpeg.list')
local MAXRECLEVEL = 200
local Capture = {}
Capture.__index = Capture
-- kind is CapKind of the capture
-- bytePos is the subject position (in bytes)
-- byteLen is the length of the capture (in bytes)
-- extra is extra info (group name, arg index, etc)
function Capture:new(kind, bytePos, byteLen, extra)
assert(getmetatable(kind) == CapKind)
return setmetatable({
kind = kind, bytePos = bytePos, byteLen = byteLen, extra = extra,
}, self)
end
function Capture:__tostring()
return string.format("Capture{kind=%s, pos=%d, len=%s, extra=%s}",
self.kind, self.bytePos, self.byteLen, self.extra)
end
function Capture:isopencap()
return self.byteLen == nil
end
-- true if c2 is (any number of levels) inside self
function Capture:inside(c2)
if self:isopencap() then
return not c2:isclosecap()
else
return c2.bytePos < (self.bytePos + self.byteLen)
end
end
function Capture:isclosecap()
return self.kind == CapKind.close
end
--[[
** Return the size of capture 'cap'. If it is an open capture, 'close'
** must be its corresponding close.
]]--
function Capture:size(close)
if self:isopencap() then
assert(close:isclosecap())
return close.bytePos - self.bytePos
else
return self.byteLen
end
end
function CapKind:newCapture(bytePos, byteLen, extra)
return Capture:new(self, bytePos, byteLen, extra)
end
local CapState = {}
CapState.__index = CapState
-- Capture cap: current capture
-- Capture ocap: (original) capture list
-- int ptop: index of last argument to 'match'
-- string s: original string
-- int valuecached: value stored in cache slot
-- int reclevel: recursion level
function CapState:new(captures, source, extraArgs)
return setmetatable({
captures = captures,
index = 1,
source = source,
valuecached = {},
reclevel = 0,
extraArgs = extraArgs,
}, self)
end
function CapState:cap() -- helper
return self.captures[self.index]
end
function CapState:advance() -- helper
local i = self.index
local cap = self.captures[i]
self.index = i + 1
return cap, i
end
function CapState:substr(start, len) -- helper
return string.sub(self.source, start, start + len - 1)
end
function CapState:skipclose(head)
if head:isopencap() then
assert(self.captures[self.index]:isclosecap())
self.index = self.index + 1
end
end
function CapState:closesize(head)
return head:size(self:cap())
end
--[[
** Go to the next capture at the same level
]]--
function CapState:nextcap()
local cap = self:cap()
if cap:isopencap() then -- must look for a close
local n = 0 -- number of opens waiting a close
while true do -- look for corresponding close
self.index = self.index + 1
cap = self:cap()
if cap:isopencap() then
n = n + 1
elseif cap:isclosecap() then
if n == 0 then break end
n = n - 1
end
end
self.index = self.index + 1 -- skip last close (or entire single capture)
else
self.index = self.index + 1
while cap:inside(self:cap()) do
self.index = self.index + 1 -- skip captures inside the current one
end
end
end
--[[
** Goes back in a list of captures looking for an open capture
** corresponding to a close
]]--
function CapState:findopen(i) -- captures[i] is the close that we want to match
assert(self.captures[i]:isclosecap())
local n = 0 -- number of closes waiting an open
while i > 1 do
i = i - 1
local cap = self.captures[i]
if cap:isclosecap() then
n = n + 1 -- one more open to skip
elseif cap:isopencap() then
if n == 0 then return cap,i end
n = n - 1
end
end
error("couldn't find open")
end
--[[
** Checks whether group 'grp' is visible to 'ref', that is, 'grp' is
** not nested inside a full capture that does not contain 'ref'. (We
** only need to care for full captures because the search at 'findback'
** skips open-end blocks; so, if 'grp' is nested in a non-full capture,
** 'ref' is also inside it.) To check this, we search backward for the
** inner full capture enclosing 'grp'. A full capture cannot contain
** non-full captures, so a close capture means we cannot be inside a
** full capture anymore.
]]--
function CapState:capvisible(igrp, ref)
local i = igrp
local grp = self.captures[igrp]
while i > 1 do
i = i - 1
local cap = self.captures[i]
if cap:isclosecap() then
return true -- can stop the search
elseif cap:inside(grp) then -- is 'grp' inside cap?
return cap:inside(ref) -- ok iff cap also contains ref
end
end
return true -- 'grp' is not inside any capture
end
--[[
** Try to find a named group capture with the name given;
** goes backward from 'i'.
]]--
function CapState:findback(name, i)
if i == nil then i = self.index end
local ref = self.captures[i]
while i > 1 do
i = i - 1
local cap = self.captures[i]
if cap:isclosecap() or not cap:inside(ref) then
if cap:isclosecap() then
cap,i = self:findopen(i)
end
if cap.kind == CapKind.group and self:capvisible(i, ref) then
if cap.extra == name then
return cap,i
end
end
end
end
error("back reference '"..name.."' not found")
end
function CapState:getcaptures()
local result = List:new()
while not self:cap():isclosecap() do
self:pushcapture(result)
end
return result
end
function CapState:pushcapture(result)
self.reclevel = self.reclevel + 1
if self.reclevel > MAXRECLEVEL then
error("subcapture nesting too deep")
end
local cap = self.captures[self.index]
assert(cap.kind.push ~= nil)
local res = cap.kind.push(self, cap, result)
self.reclevel = self.reclevel - 1
return res
end
-- helper functions for pushcapture
--[[
** Push on the Lua stack all values generated by nested captures inside
** the current capture. Returns number of values pushed. 'addextra'
** makes it push the entire match after all captured values. The
** entire match is pushed also if there are no other nested values,
** so the function never returns zero.
]]--
function CapState:pushnestedvalues(result, addextra)
local head = self:advance()
local n = 0 -- number of pushed subvalues
-- repeat for all nested patterns
while head:inside(self:cap()) do
n = n + self:pushcapture(result)
end
if addextra or n == 0 then -- need extra?
result:push(self:substr(head.bytePos, self:closesize(head)))
n = n + 1
end
self:skipclose(head)
return n
end
--[[
** Push only the first value generated by nested captures
]]--
function CapState:pushonenestedvalue(result)
local n = self:pushnestedvalues(result, false)
if n == 0 then
result:push(nil) -- ensure there's exactly one value
return 1
end
while n > 1 do
result:pop() -- pop extra values
n = n - 1
end
return n
end
-- visitor patterns for pushcapture
function CapKind.position.push(capstate, cap, result)
result:push(cap.bytePos)
capstate.index = capstate.index + 1
return 1
end
function CapKind.const.push(capstate, cap, result)
result:push(cap.extra)
capstate.index = capstate.index + 1
return 1
end
function CapKind.arg.push(capstate, cap, result)
local n = cap.extra
if n > capstate.extraArgs.n then
error(string.format("reference to absent extra argument #%d", n))
end
result:push(capstate.extraArgs[n])
capstate.index = capstate.index + 1
return 1
end
function CapKind.simple.push(capstate, cap, result)
local k = capstate:pushnestedvalues(result, true)
-- reorder so that the whole match is the first result, not the last
local last = result:pop()
result:insert(2 + #result - k, last)
return k
end
-- missing a bunch
--[[
** Table capture: creates a new table and populates it with nested
** captures.
]]--
function CapKind.table.push(capstate, cap, result) -- aka tablecap
local t = {}
result:push(t)
local head = capstate:advance()
local n = 0
while head:inside(capstate:cap()) do
cap = capstate:cap()
if cap.kind == CapKind.group and cap.extra ~= nil then -- named group?
capstate:pushonenestedvalue(result)
t[cap.extra] = result:pop() -- move it into table
else -- not a named group
local k = capstate:pushcapture(result)
for i=k,1,-1 do
t[n + i] = result:pop() -- move it into table (indexed)
end
n = n + k
end
end
capstate:skipclose(head)
return 1 -- number of values pushed (only the table)
end
--[[
** Table-query capture
]]--
function CapKind.query.push(capstate, cap, result) -- aka querycap
capstate:pushonenestedvalue(result)
local key = result:pop()
local tbl = cap.extra
assert(type(tbl) == "table")
local val = tbl[key]
if val ~= nil then
result:push(val)
return 1
else
return 0
end
end
--[[
** Fold capture
]]--
function CapKind.fold.push(capstate, cap, result) -- aka foldcap
local f = cap.extra
assert(type(f) == "function")
local head = capstate:advance()
if capstate:cap():isclosecap() then
-- no nested captures? (large subject)
error("no initial value for fold capture")
end
local args = List:new()
local n = capstate:pushcapture(args)
if n == 0 then
-- nested captures with no values?
error("no initial value for fold capture")
end
local accum = args[1] -- leave only one result for accumulator
while head:inside(capstate:cap()) do
args = List:new()
args:push( accum ) -- put accumulator first
n = capstate:pushcapture(args) -- get next capture's values
accum = f(compat.unpack(args))
end
capstate:skipclose(head)
-- only accumulator left in result
result:push(accum)
return 1
end
--[[
** Function capture
]]--
CapKind["function"].push = function(capstate, cap, result)
local f = cap.extra
assert(type(f) == "function")
local args = List:new()
local n = capstate:pushnestedvalues(args, false)
local r = compat.pack(f(compat.unpack(args)))
for i=1,r.n do
result:push(r[i])
end
return r.n
end
--[[
** Accumulator capture
]]--
function CapKind.acc.push(capstate, cap, result) -- aka accumulatorcap
if #result == 0 then
error("no previous value for accumulator capture")
end
local f = cap.extra
assert(type(f) == "function")
local prev = #result
local args = List:new()
args:push(result[prev])
local n = capstate:pushnestedvalues(args, false)
result[prev] = f(compat.unpack(args))
return 0 -- did not add any extra value
end
--[[
** Select capture
]]--
function CapKind.num.push(capstate, cap, result) -- aka numcap
local idx = cap.extra -- value to select
if idx == 0 then -- no values?
capstate:nextcap() -- skip entire capture
return 0 -- no value produced
else
local vals = List:new()
local n = capstate:pushnestedvalues(vals, false)
if n < idx then -- invalid index?
error("no capture '"..idx.."'")
else
result:push(vals[idx])
return 1
end
end
end
function CapState:runtimecap(closePos)
local close = self.captures[closePos]
local open,openPos = self:findopen(closePos) -- get open group capture
assert(open.kind == CapKind.group)
self.index = openPos -- set state to the open capture
local args = List:new()
args:push( self.source) -- original subject
args:push( close.bytePos ) -- current position
local n = self:pushnestedvalues(args, false) -- push nested captures
local func = open.extra
local funcRet = compat.pack(func(compat.unpack(args)))
local res = closePos - openPos -- number of captures to be removed
return res, funcRet
end
function CapKind.runtime.push(capstate, cap, result) -- aka runtimecap
result:push(cap.extra)
capstate.index = capstate.index + 1
return 1
end
local MAXSTRCAPS = 10
--[[
** Collect values from current capture into array 'cps'. Current
** capture must be Cstring (first call) or Csimple (recursive calls).
** (In first call, fills %0 with whole match for Cstring.)
** Returns number of elements in the array that were filled.
]]--
function CapState:getstrcaps(cps, n)
local k = n
n = n + 1
cps[k] = {
isstring = true, -- get string value
bytePos = self:cap().bytePos, -- starts here
}
local head = self:advance()
while head:inside(self:cap()) do
if n > MAXSTRCAPS then -- too many captures?
self:nextcap() -- skip extra captures (will not need them)
elseif self:cap().kind == CapKind.Simple then -- string?
n = self:getstrcaps(cps, n) -- put info into array
else
cps[n] = {
isstring = false, -- not a string
cap = self.index, -- keep original capture
}
self:nextcap()
n = n + 1
end
end
cps[k].endPos = head.bytePos + self:closesize(head)
self:skipclose(head)
return n
end
function CapState:addonestring(buffer, what)
local cap = self:cap()
if cap.kind == CapKind.string then
-- add capture directly to buffer
return stringcap(self, buffer)
elseif cap.kind == CapKind.subst then
-- add capture directly to buffer
return substcap(self, buffer)
elseif cap.kind == CapKind.acc then
error("invalid context for an accumulator capture")
end
local result = List:new()
local n = self:pushcapture(result)
if n == 0 then return 0 end -- no values to add
local val = result[1] -- take only one result (the first)
if type(val) == "number" then
val = tostring(val)
elseif type(val) ~= "string" then
error("invalid "..what.." value (a "..type(val)..")")
end
table.insert(buffer, val)
return 1
end
--[[
** String capture: add result to buffer 'b' (instead of pushing
** it into the stack)
]]--
function stringcap(capstate, buffer)
local fmt = capstate:cap().extra
local cps = {}
local n = capstate:getstrcaps(cps, 1) - 1 -- collect nested captures
local sawEscape = false
for _,c in compat.utf8codes(fmt) do
if sawEscape then
sawEscape = false
if c < 48 or c > 57 then -- not followed by a digit
table.insert(buffer, compat.utf8char(c))
else
local l = 1 + c - 48 -- capture index (1-based)
if l > n then
error("invalid capture index ("..(l-1)..")")
elseif cps[l].isstring then
table.insert(buffer, capstate:substr(cps[l].bytePos, cps[l].endPos - cps[l].bytePos))
else
-- go back to evaluate that nested capture
local curr = capstate.index
capstate.index = cps[l].cap
if capstate:addonestring(buffer, "capture") == 0 then
error("no values in capture index "..l)
end
capstate.index = curr
end
end
elseif c ~= 37 then -- not a % escape?
table.insert(buffer, compat.utf8char(c))
else
sawEscape = true
end
end
return 1
end
--[[
** Substitution capture: add result to buffer 'b'
]]--
function substcap(capstate, buffer)
local head = capstate:advance()
local curr = head.bytePos
while head:inside(capstate:cap()) do
local cap = capstate:cap()
local nextPos = cap.bytePos
local s = capstate:substr(curr, nextPos - curr)
table.insert(buffer, s) -- add text up to capture
if capstate:addonestring(buffer, "replacement") == 0 then
-- no capture value, keep original text in final result
curr = nextPos
else
-- continue after match
local lastCap = capstate.captures[capstate.index - 1]
curr = nextPos + cap:size(lastCap)
end
end
-- add last piece of text
local s = capstate:substr(curr, head.bytePos + capstate:closesize(head) - curr)
table.insert(buffer, s)
capstate:skipclose(head)
end
function CapKind.subst.push(capstate, cap, result) -- aka substcap
local buffer = {}
substcap(capstate, buffer)
result:push(table.concat(buffer))
return 1
end
function CapKind.string.push(capstate, cap, result) -- aka stringcap
local buffer = {}
stringcap(capstate, buffer)
result:push(table.concat(buffer))
return 1
end
function CapKind.group.push(capstate, cap, result)
if cap.extra == nil then -- anonymous group?
return capstate:pushnestedvalues(result, false) -- add all nested values
else -- named group: add no values
capstate:nextcap()
return 0
end
end
--[[
** Back-reference capture. Return number of values pushed.
]]--
function CapKind.backref.push(capstate, cap, result)
local curr = capstate.index
local _,i = capstate:findback(cap.extra)
capstate.index = i
local n = capstate:pushnestedvalues(result, false)
capstate.index = curr + 1
return n
end
return {
CapState = CapState,
Capture = Capture,
}
end)
register('llpeg.vm', function(myrequire)
myrequire('strict')
local compat = myrequire('advent.compat')
local utf8util = myrequire('llpeg.utf8util')
local from = myrequire('llpeg.types').from
local
CHARMAX,
CapKind,
Opcode,
enum,
_ = from(myrequire('llpeg.types'), {
'CHARMAX',
'CapKind',
'Opcode',
'enum',
})
local
CapState,
Capture,
__ = from(myrequire('llpeg.cap'), {
'CapState',
'Capture',
})
local
Instruction,
___ = from(myrequire('llpeg.code'), {
'Instruction',
})
local
printcaplist,
___ = from(myrequire('llpeg.print'), {
'printcaplist',
})
local LFAIL = {}
local InsidePred = enum{
OUTPRED = 0,
INPRED = 1,
}
local Stack = {}
Stack.__index = Stack
-- Stack prev: previous entry in the stack
-- int bytePos: saved position, or NULL for calls
-- Instruction pc: saved instruction
-- int caplevel
-- int labenv -- for labeled failure
-- bool predchoice -- for labeled failure
function Stack:new(prev, bytePos, pc, caplevel, labenv, predchoice)
return setmetatable({
prev = prev,
bytePos = bytePos, pc = pc, caplevel = caplevel,
labenv = labenv, predchoice = predchoice,
}, self)
end
function Stack:__tostring()
return string.format(
"Stack{ bytePos=%d, caplevel=%d, labenv=%s, predchoice=%s }",
self.bytePos, self.caplevel, self.labenv, self.predchoice
)
end
function Stack:print()
local s = self
while s ~= nil do
print("Stack", s)
s = s.prev
end
end
local MatchResult = {}
MatchResult.__index = MatchResult
function MatchResult:new()
return setmetatable({
labelf = nil, -- failure label
sfail = -1, -- farthest failure
}, self)
end
local State = {}
State.__index = State
function State:new(source, bytePos, ...)
local giveup = Instruction:new{Opcode.Giveup}
local insidepred = InsidePred.OUTPRED -- label environment is off inside predicates
local stack = Stack:new(nil, bytePos, giveup, 0, insidepred, nil)
local cp,cpLen
if bytePos <= #source then
cp, cpLen = utf8util.codepointAt(source, bytePos)
else
cp, cpLen = nil, nil
end
assert(bytePos ~= nil)
return setmetatable({
source = source, -- the source string
bytePos = bytePos, -- current position in the string, in bytes
codepoint = cp, -- the codepoint at 'bytePos' in 'source'
codepointLen = cpLen, -- the length of the codepoint at 'bytePos'
stack = stack, -- top of stack
captures = {}, -- list of captures
captop = 1, -- point to first empty slot in captures (1-based)
extraArgs = compat.pack(...),
-- labeled failures:
insidepred = insidepred,
labelf = nil, -- failure label
sfail = -1, -- farthest failure
}, self)
end
function State:advance()
return self:resetPosTo(self.bytePos + self.codepointLen)
end
function State:resetPosTo(newPos)
assert(newPos ~= nil)
self.bytePos = newPos
local source = self.source
if newPos <= #source then
local cp, cpLen = utf8util.codepointAt(source, newPos)
self.codepoint = cp
self.codepointLen = cpLen
return cp
else
self.codepoint = nil
self.codepointLen = nil
return nil
end
end
function State:backtrack(n)
local off = utf8util.offset(self.source, -n, self.bytePos)
if off == nil then return false end -- can't backtrack that far!
self:resetPosTo(off)
return true
end
function State:updatefarthest(label)
self.labelf = label
if self.bytePos > self.sfail then
self.sfail = self.bytePos
end
end
function State:pushcapture(cap)
self.captures[self.captop] = cap
self.captop = self.captop + 1
end
function State:fail()
-- pattern failed, try to backtrack
local lastStack
repeat
lastStack = self.stack
self.stack = lastStack.prev
until lastStack.bytePos ~= nil
self:resetPosTo(lastStack.bytePos)
self.captop = lastStack.caplevel
self.insidepred = lastStack.labenv -- labeled failure
return lastStack.pc:exec(self)
end
function State:giveup()
local r = nil
local lab = "fail"
local errpos = self.sfail
if self.labelf ~= nil and self.labelf ~= LFAIL then
lab = self.labelf
end
return r, lab, errpos
end
function State:getcaptures()
local results = {}
if self.captures[1].kind == CapKind.close then -- are there any captures?
return results -- no captures
end
return CapState:new(self.captures, self.source, self.extraArgs):getcaptures()
end
function Opcode.End:exec(state)
state:pushcapture(CapKind.close:newCapture(state.bytePos, 0))
-- trim table to proper length
while #state.captures > state.captop - 1 do
table.remove(state.captures)
end
-- printcaplist(state.captures, #state.captures) -- for debugging
local results = state:getcaptures()
if #results == 0 then -- no captured values?
return state.bytePos -- return only end position
else
return compat.unpack(results)
end
end
function Opcode.Giveup:exec(state)
return state:giveup()
end
function Opcode.Ret:exec(state)
local lastStack = state.stack
state.stack = lastStack.prev
return lastStack.pc:exec(state)
end
function Opcode.Any:exec(state)
if state.codepoint ~= nil then
state:advance()
return self.next:exec(state)
end
state:updatefarthest(LFAIL)
return state:fail()
end
function Opcode.TestAny:exec(state)
if state.codepoint ~= nil then
return self.next:exec(state)
else
return self.branch:exec(state)
end
end
function Opcode.UTFR:exec(state)
local cp = state.codepoint
if cp ~= nil and self.from <= cp and cp <= self.to then
state:advance()
return self.next:exec(state)
end
state:updatefarthest(LFAIL)
return state:fail()
end
function Opcode.Char:exec(state)
if state.codepoint == self.aux then
state:advance()
return self.next:exec(state)
end
state:updatefarthest(LFAIL)
return state:fail()
end
function Opcode.TestChar:exec(state)
if state.codepoint == self.aux then
return self.next:exec(state)
else
return self.branch:exec(state)
end
end
function Opcode.Set:exec(state)
local cp = state.codepoint
if cp ~= nil then
if cp <= CHARMAX then
if self.set[cp] then
state:advance()
return self.next:exec(state)
end
else
if self.rest then
state:advance()
return self.next:exec(state)
end
end
end
state:updatefarthest(LFAIL)
return state:fail()
end
function Opcode.TestSet:exec(state)
local cp = state.codepoint
if cp ~= nil then
if cp <= CHARMAX then
if self.set[cp] then
return self.next:exec(state)
end
elseif self.rest then
return self.next:exec(state)
end
end
return self.branch:exec(state)
end
function Opcode.Behind:exec(state)
local n = self.aux
-- XXX SLOW self.aux is in *characters* not *bytes*
if state:backtrack(n) then
return self.next:exec(state)
end
state:updatefarthest(LFAIL)
return state:fail()
end
function Opcode.Span:exec(state)
local cp = state.codepoint
while true do
local match = false
if cp ~= nil then
if cp <= CHARMAX then
if self.set[cp] then match = true end
else
if self.rest then match = true end
end
end
if not match then break end
cp = state:advance()
end
return self.next:exec(state)
end
function Opcode.Jmp:exec(state)
return self.branch:exec(state)
end
function Opcode.Choice:exec(state)
state.stack = Stack:new(
state.stack, state.bytePos, self.branch, state.captop, state.insidepred
)
return self.next:exec(state)
end
function Opcode.PredChoice:exec(state)
state.stack = Stack:new(
state.stack, state.bytePos, self.branch, state.captop, state.insidepred,
true -- predchoice
)
state.insidepred = InsidePred.INPRED
return self.next:exec(state)
end
function Opcode.Call:exec(state)
state.stack = Stack:new(
state.stack, nil, self.next
)
return self.branch:exec(state)
end
function Opcode.Commit:exec(state)
state.stack = state.stack.prev
return self.branch:exec(state)
end
function Opcode.PartialCommit:exec(state)
local stack = state.stack
stack.bytePos = state.bytePos
stack.caplevel = state.captop
return self.branch:exec(state)
end
function Opcode.BackCommit:exec(state)
local stack = state.stack
state.stack = stack.prev -- pop the stack
state:resetPosTo(stack.bytePos) -- but reset the position to that stored
state.insidepred = stack.labenv -- labeled failure
state.captop = stack.caplevel
return self.branch:exec(state)
end
function Opcode.Throw:exec(state)
if state.insidepred == InsidePred.OUTPRED then
state.labelf = self.key
-- pop entire stack
while state.stack.prev ~= nil do
state.stack = state.stack.prev
end
else
state.labelf = LFAIL
-- pop until you read a 'predchoice' state
while not state.stack.predchoice do
state.stack = state.stack.prev
end
end
state.sfail = state.bytePos
return state:fail()
end
function Opcode.ThrowRec:exec(state) -- with recovery
state.sfail = state.bytePos
if state.insidepred == InsidePred.OUTPRED then
state.labelf = self.key
state.stack = Stack:new(
state.stack, nil, self.next, state.captop
)
return self.branch:exec(state)
else
state.labelf = LFAIL
-- pop until you read a 'predchoice' state
while not state.stack.predchoice do
state.stack = state.stack.prev
end
return state:fail()
end
end
function Opcode.FailTwice:exec(state)
state.stack = state.stack.prev
state:updatefarthest(LFAIL)
return state:fail()
end
function Opcode.Fail:exec(state)
state:updatefarthest(LFAIL)
return state:fail()
end
function Opcode.CloseRunTime:exec(state)
-- close the group
state:pushcapture(self.cap:newCapture(state.bytePos, 0))
-- trim table to proper length
while #state.captures > state.captop - 1 do
table.remove(state.captures)
end
local cs = CapState:new(state.captures, state.source, state.extraArgs)
local n, funcRet = cs:runtimecap(state.captop - 1)
state.captop = state.captop - n -- remove nested captures
-- resdyncaptures: resolve returned values in `funcRet`
-- first argument false=fail, true=keep current pos, number=next position
local firstArg = funcRet[1]
if funcRet.n == 0 then
firstArg = false -- returning void means we'll fail
end
if not firstArg then -- if it is falsey, discard rest of returned vals & fail
state:updatefarthest(LFAIL)
return state:fail() -- tail call
elseif type(firstArg) == "boolean" then
-- keep current position, nothing needs to be done
else
local npos = tonumber(firstArg)
if npos < state.bytePos or npos > (1 + #(state.source)) then
error("invalid position returned by match-time capture")
end
state:resetPosTo(npos)
end
-- push the rest of the funcRet values as new captures
local n = funcRet.n - 1 -- number of new captures
if n == 0 then -- no new captures?
state.captop = state.captop - 1 -- remove open group
else
-- new captures, keep original open group
-- add new captures + close group to 'capture' list
-- adddyncaptures:
assert(state.captures[state.captop - 1].kind == CapKind.group)
assert(state.captures[state.captop - 1]:isopencap())
-- make group capture an anonymous group (this used to hold match-time f)
state.captures[state.captop - 1].extra = nil
for i=2,funcRet.n do -- add runtime captures
state:pushcapture(CapKind.runtime:newCapture(state.bytePos, 0, funcRet[i]))
end
-- close group
state:pushcapture(CapKind.close:newCapture(state.bytePos, 0))
end
return self.next:exec(state)
end
local MAXLOP = 20
function findopen(captures, i, currPos)
i = i - 1 -- check last captop
local cap = captures[i]
if (not cap:isopencap()) and cap.bytePos == currPos then
return nil -- current one cannot be a full capture
end
-- else, look for an 'open' capture
for j=1,MAXLOP do
if cap:isopencap() then -- open capture?
return cap,i -- that's the one to be closed
elseif cap.kind == CapKind.close then
return nil -- a full capture should not nest a non-full one
end
i = i - 1
if i<1 then break end
cap = captures[i]
end
return nil -- not found within allowed search limit
end
function Opcode.CloseCapture:exec(state)
-- if possible, turn capture into a full capture
assert(state.captop > 1)
local open,_ = findopen(state.captures, state.captop, state.bytePos)
if open ~= nil then -- if possible, turn capture into a full capture
open.byteLen = state.bytePos - open.bytePos
else
-- non-nil length to mark entry as closed
state:pushcapture(self.cap:newCapture(state.bytePos, 0))
end
return self.next:exec(state)
end
function Opcode.OpenCapture:exec(state)
state:pushcapture(self.cap:newCapture(
-- byteLen = nil marks entry as open
state.bytePos, nil, self.key
))
return self.next:exec(state)
end
function Opcode.FullCapture:exec(state)
-- XXX SLOW: self.aux is in *characters* not *bytes*
local nPos = utf8util.offset(state.source, -self.aux, state.bytePos)
state:pushcapture(self.cap:newCapture(
nPos, state.bytePos - nPos, self.key
))
return self.next:exec(state)
end
function match(s, init, code, ...)
local state = State:new(s, init, ...)
return code[1]:exec(state)
end
return {
match = match,
}
end)
register('llpeg', function(myrequire)
local VERSION = '0.0.1'
local MAXSTACK = 400 -- maximum backtracking
local MAXBEHIND = 255 -- maximum look-behind
local MAXRULES = 1000 -- maximum number of rules
local LPEG_COMPAT = true
myrequire('strict')
local compat = myrequire('advent.compat')
local from = myrequire('llpeg.types').from
local
CHARMAX,
CapKind,
TTag,
define,
define_tree_visitor,
metareg,
newcharset,
numsiblings,
_ = from(myrequire('llpeg.types'), {
'CHARMAX',
'CapKind',
'TTag',
'define',
'define_tree_visitor',
'metareg',
'newcharset',
'numsiblings',
})
local
compile,
cs_diff,
cs_union,
fixedlen,
hascaptures,
nofail,
nullable,
nullable_with_grammar,
tocharset,
__ = from(myrequire('llpeg.code'), {
'compile',
'cs_diff',
'cs_union',
'fixedlen',
'hascaptures',
'nofail',
'nullable',
'nullable_with_grammar',
'tocharset',
})
local
match,
___ = from(myrequire('llpeg.vm'), {
'match',
})
local
printpatt,
printrepl,
printtree,
____ = from(myrequire('llpeg.print'), {
'printpatt',
'printrepl',
'printtree',
})
function checkint(v)
if type(v) == 'string' then
v = tonumber(v)
end
if type(v) ~= "number" then
error("not a number")
end
return math.floor(v)
end
local is_pattern = define_type_visitor{
pattern = function() return true end,
default = function() return false end,
}
local ptype = define_type_visitor{
pattern = function() return "pattern" end,
default = function(v) return type(v) end,
}
function val2str(v)
if type(v) == 'number' then return tostring(v) end
if type(v) == 'string' then return v end
return string.format("(a %s)", ptype(v))
end
--[[ lpltree.c ]]--
function newtree(tag)
local t = {
tag = tag,
code = nil,
}
setmetatable(t, metareg)
return t
end
function newleaf(tag, n)
return setmetatable({
tag = tag,
code = nil,
n = n,
}, metareg)
end
function newroot1sib(tag, sib1)
return setmetatable({
tag = tag,
code = nil,
sib1 = sib1,
}, metareg)
end
function newroot2sib(tag, sib1, sib2)
return setmetatable({
tag = tag,
code = nil,
sib1 = sib1,
sib2 = sib2,
}, metareg)
end
--[[ Build a sequence of #s nodes from the array 's' with the tag 'tag' ]]--
function fillseq(tag, s)
if type(s) == 'number' then
local len = checkint(s)
s = setmetatable({}, {__len = function() return len end})
end
if #s == 0 then
return newleaf(tag, 0)
end
local i = #s
local result = newleaf(tag, s[i])
while i > 1 do
i = i - 1
result = newroot2sib(
TTag.Seq,
newleaf(tag, s[i]),
result
)
end
return result
end
--[[ Numbers as patterns:
0 == true (always match); n == TAny repeated 'n' times;
-n == not (TAny repeated 'n' times)
]]--
function numtree(n)
n = checkint(n)
if n == 0 then
return newleaf(TTag.True)
elseif n > 0 then
return fillseq(TTag.Any, n) -- sequence of 'n' anys
else
return newroot1sib(TTag.Not, fillseq(TTag.Any, -n))
end
end
-- Convert value v to a pattern
local getpatt = define_type_visitor{
["string"] = function(s)
if #s == 0 then
return newleaf(TTag.True) -- always match if string is empty
end
local cp = {}
for _,c in compat.utf8codes(s) do
table.insert(cp, c)
end
return fillseq(TTag.Char, cp)
end,
["number"] = function(n)
return numtree(n)
end,
["boolean"] = function(b)
if b then
return newleaf(TTag.True)
else
return newleaf(TTag.False)
end
end,
["function"] = function(f)
return setmetatable({
tag = TTag.RunTime,
code = nil,
key = f,
sib1 = newleaf(TTag.True),
}, metareg)
end,
["pattern"] = function(v)
return v
end,
["table"] = function(v)
return newgrammar(v)
end,
default = function(v)
error("Not a pattern")
end,
}
-- labeled failure begin
function newthrowleaf(label)
return setmetatable({
tag = TTag.Throw,
code = nil,
sib2 = nil, -- no recovery rule associated (yet)
key = label,
}, metareg)
end
-- labeled failure end
function lp_P(v)
return getpatt(v)
end
--[[
** sequence operator; optimizations:
** false x => false, x true => x, true x => x
** (cannot do x . false => false because x may have runtime captures)
]]--
function lp_seq(tree1, tree2)
tree1 = getpatt(tree1)
tree2 = getpatt(tree2)
if tree1.tag == TTag.False or tree2.tag == TTag.True then
-- false . x = false, x . true = x
return tree1
elseif tree1.tag == TTag.True then
-- true . x = x
return tree2
else
return newroot2sib(TTag.Seq, tree1, tree2)
end
end
--[[
** choice operator; optimizations:
** charset / charset => charset
** true / x => true, x / false => x, false / x => x
** (x / true is not equivalent to true)
]]--
function lp_choice(t1, t2)
t1 = getpatt(t1)
t2 = getpatt(t2)
local t1c = tocharset(t1)
local t2c = tocharset(t2)
if t1c ~= nil and t2c ~= nil then
local t = cs_union(t1c, t2c)
return t
elseif nofail(t1) or t2.tag == TTag.False then
-- true / x => true, x / false => x
return t1
elseif t1.tag == TTag.False then
-- false / x => x
return t2
else
return newroot2sib(TTag.Choice, t1, t2)
end
end
--[[
p^n
]]--
function lp_star(p, n)
local tree1 = getpatt(p)
n = checkint(n)
if n >= 0 then -- seq tree1 (seq tree1 ... (seq tree1 (rep tree1)))
if nullable(tree1) then
error("loop body may accept empty string")
end
local tree = newroot1sib(TTag.Rep, tree1)
while n > 0 do
tree = newroot2sib(TTag.Seq, tree1, tree)
n = n - 1
end
return tree
else -- choice (seq tree1 ... choice tree1 true ...) true
n = -n
local tree = newroot2sib( -- at most 1
TTag.Choice,
tree1,
newleaf(TTag.True)
)
while n > 1 do
tree = newroot2sib( -- at most (n-1)
TTag.Seq,
tree1,
tree
)
tree = newroot2sib(TTag.Choice, tree, newleaf(TTag.True))
n = n - 1
end
return tree
end
end
--[[
** #p == &p
]]--
function lp_and(v)
return newroot1sib(TTag.And, getpatt(v))
end
--[[
** -p == !p
]]--
function lp_not(v)
return newroot1sib(TTag.Not, getpatt(v))
end
--[[
** [t1 - t2] == Seq (Not t2) t1
** If t1 and t2 are charsets, make their difference.
]]--
function lp_sub(t1, t2)
t1 = getpatt(t1)
t2 = getpatt(t2)
local t1c = tocharset(t1)
local t2c = tocharset(t2)
if t1c ~= nil and t2c ~= nil then
return cs_diff(t1c, t2c)
else
return newroot2sib(
TTag.Seq,
newroot1sib(TTag.Not, t2),
t1
)
end
end
--[[
A set with the given characters
]]--
function lp_set(s)
local t = newcharset()
local extra = nil
for _,c in compat.utf8codes(s) do
if c > CHARMAX then
-- non ascii, we can't use charset for these
local one = newleaf(TTag.Char, c)
if extra == nil then
extra = one
else
extra = newroot2sib(TTag.Choice, extra, one)
end
else
t.set[c] = true
end
end
if extra == nil then
return t
else
return newroot2sib(TTag.Choice, t, extra)
end
end
function lp_range(...)
local t = newcharset()
local extra = nil
for _,v in ipairs{...} do
if type(v) ~= "string" then
error("argument must be string")
else
local first, second
for _,c in compat.utf8codes(v) do
if first == nil then
first = c
elseif second == nil then
second = c
else
error("range must have two characters")
end
end
if first == nil or second == nil then
error("range must have two characters")
end
if first > second then
if LPEG_COMPAT then
-- ignore, just silently create an empty range
else
error("empty range")
end
elseif second <= CHARMAX then -- ascii range
for i = first, second do
t.set[i] = true
end
else
local r = lp_utfr(first, second)
if extra == nil then
extra = r
else
extra = newroot2sib(TTag.Choice, extra, one)
end
end
end
end
if extra == nil then
return t
else
return newroot2sib(TTag.Choice, t, extra)
end
end
function lp_utfr(from, to)
from = checkint(from)
to = checkint(to)
if from > to then
error("empty range")
end
if to > 0x10FFFF then
error("invalid code point")
end
if to <= CHARMAX then -- ascii range?
local t = newcharset() -- code it as a regular charset
for i = from, to do
t.set[i] = true
end
return t
end
-- multibyte utf-8 range
return setmetatable({
tag = TTag.UTFR,
code = nil,
from = from,
to = to,
}, metareg)
end
--[[
Look-behind predicate
]]--
function lp_behind(v)
local tree1 = getpatt(v)
local n = fixedlen(tree1)
if n < 0 then
error("pattern may not have fixed length")
end
if hascaptures(tree1) then
error("pattern has captures")
end
if n > MAXBEHIND then
error("pattern too long to look behind")
end
return setmetatable({
tag = TTag.Behind,
code = nil,
sib1 = tree1,
n = n,
}, metareg)
end
--[[ labeled failure begin ]]--
--[[
** Throws a label
]]--
local lp_throw = define_type_visitor{
[{"string","number"}] = newthrowleaf,
default = function() error("not a string") end,
}
--[[ labeled failure end ]]--
--[[
** Create a non-terminal
]]--
function lp_V(v)
if v == nil then
error("non-nil value expected")
end
return setmetatable({
tag = TTag.Call,
code = nil,
key = v,
}, metareg)
end
--[[
** Create a tree for a non-empty capture, with a body and
** optionally with an associated Lua value (at index 'labelidx' in the
** stack)
]]--
function capture_aux(capkind, patt, val)
local t = newroot1sib(TTag.Capture, getpatt(patt))
t.cap = capkind
t.key = val
return t
end
function newemptycap(capkind, val)
return capture_aux(capkind, newleaf(TTag.True), val)
end
--[[
** Captures with syntax p / v
** (function capture, query capture, string capture, or number capture)
]]--
local divcapture_helper = define_type_visitor{
["function"] = function(v, p)
return capture_aux(CapKind["function"], p, v)
end,
["table"] = function(v, p)
return capture_aux(CapKind.query, p, v)
end,
["string"] = function(v, p)
return capture_aux(CapKind.string, p, v)
end,
["number"] = function(v, p)
v = checkint(v)
if v < 0 or v > 65536 then
error("invalid number")
end
return capture_aux(CapKind.num, p, v)
end,
default = function(v)
error("unexpected "..ptype(v).." as 2nd operand to LPeg '/'") end,
}
function lp_divcapture(p, v)
return divcapture_helper(v, p) -- dispatch on v
end
function lp_acccapture(p, v)
return capture_aux(CapKind.acc, p, v)
end
-- the match for patt with the values from nested captures replacing their
-- matches
function lp_substcapture(patt)
return capture_aux(CapKind.subst, patt)
end
-- a table with all captures from patt
function lp_tablecapture(patt)
return capture_aux(CapKind.table, patt)
end
-- the values produced by patt, optionally tagged with key
function lp_groupcapture(patt, key)
-- key can be nil
return capture_aux(CapKind.group, patt, key)
end
-- folding capture (deprecated)
function lp_foldcapture(patt, func)
if type(func) ~= "function" then
error("Bad function argument")
end
return capture_aux(CapKind.fold, patt, func)
end
-- the match for patt plus all captures made by patt
function lp_simplecapture(patt)
return capture_aux(CapKind.simple, patt)
end
-- the current position (matches the empty string)
function lp_poscapture()
return newemptycap(CapKind.position)
end
-- the value of the nth extra argument to lpeg.match (matches the empty string)
function lp_argcapture(n)
n = checkint(n)
if n <= 0 or n > 65536 then error("invalid argument index") end
return newemptycap(CapKind.arg, n)
end
-- the value produced by the previous group capture named `key`
-- (matches the empty string)
function lp_backref(key)
return newemptycap(CapKind.backref, key)
end
-- Constant capture (matches the empty string)
function lp_constcapture(...)
local arg = compat.pack(...)
if arg.n == 0 then -- no values?
return newleaf(TTag.True) -- no capture
else
local i = arg.n
local tree = newemptycap(CapKind.const, arg[i])
while i > 1 do
i = i - 1
tree = newroot2sib(
TTag.Seq,
newemptycap(CapKind.const, arg[i]),
tree
)
end
if arg.n == 1 then
-- single constant capture
return tree
else
-- create a group capture with all values
return lp_groupcapture( tree )
end
end
end
-- the returns of function applied to the captures of patt; the application
-- is done at match time
function lp_matchtime(patt, func)
if type(func) ~= 'function' then
error('not a function')
end
return setmetatable({
tag = TTag.RunTime,
code = nil,
key = func,
sib1 = getpatt(patt),
}, metareg)
end
--[[======================================================]]--
--[[
** ======================================================
** Grammar - Tree generation
** ======================================================
]]--
--[[
** push on the stack the index and the pattern for the
** initial rule of grammar at index 'arg' in the stack;
** also add that index into position table.
]]--
function getfirstrule(tbl)
local first_name, first_rule
first_name = tbl[1]
-- is this the name of an initial rule?
if type(first_name) == 'number' or type(first_name) == 'string' then
first_rule = tbl[first_name] -- get associated rule
else
first_name,first_rule = 1,first_name
end
if not is_pattern(first_rule) then
if first_rule == nil then
error("grammar has no initial rule")
else
error(string.format("initial rule '%s' is not a pattern", first_name))
end
end
-- rule position (after TGrammar)
-- return map from name to position, and from position to name
return { [first_name] = 1 }, { first_name }
end
--[[
** traverse grammar at index 'arg', pushing all its keys and patterns
** into the stack. Create a new table (before all pairs key-pattern) to
** collect all keys and their associated positions in the final tree
** (the "position table").
** Return the number of rules and (in 'totalsize') the total size
** for the new tree.
]]--
function collectrules(tbl)
-- find the first rule and put in position table
local postab, rpostab = getfirstrule(tbl)
-- collect and sort rule names (for repeatability)
local names = {}
for k,v in pairs(tbl) do
if k == 1 or postab[k] == 1 then -- initial rule?
-- skip the initial rules, it's already in the position table
else
table.insert(names, k)
end
end
table.sort(names, function(a,b)
return tostring(a) < tostring(b)
end)
-- fill out rule, name, and position maps
for _,k in ipairs(names) do
local v = tbl[k]
if not is_pattern(v) then
error("rule '" .. val2str(k) .. "' is not a pattern")
end
table.insert(rpostab, k)
postab[k] = #rpostab
end
return postab, rpostab
end
function buildgrammar(g, tbl, postab, rpostab)
local trees = {}
for i,name in ipairs(rpostab) do
local rule = setmetatable({
tag = TTag.Rule,
code = nil,
key = nil, -- will be fixed when rule is used
n = i, -- rule number
name = name,
sib1 = tbl[name], -- pattern
sib2 = nil,
}, metareg)
table.insert(trees, rule)
g.ruletab[name] = rule
end
-- link up siblings
for i = 1, #trees-1 do
trees[i].sib2 = trees[i+1]
end
trees[#trees].sib2 = newleaf(TTag.True) -- finish list of rules
g.sib1 = trees[1]
end
--[[
** Check whether a tree has potential infinite loops
]]--
function checkloops(grammar, tree)
local n = numsiblings[tree.tag]
if tree.tag == TTag.Rep and nullable_with_grammar(tree.sib1, grammar) then
return true
elseif tree.tag == TTag.Grammar then
return false -- sub-grammars already checked
elseif n == 1 then
return checkloops(grammar, tree.sib1) -- tail call
elseif n == 2 then
if checkloops(grammar, tree.sib1) then
return true
else
return checkloops(grammar, tree.sib2) -- tail call
end
elseif n == 0 then
return false
else
error("surprising number of siblings")
end
end
--[[
** Give appropriate error message for 'verifyrule'. If a rule appears
** twice in 'passed', there is path from it back to itself without
** advancing the subject.
]]--
function verifyerror(grammar, passed, npassed)
local i, j
for i = npassed,1,-1 do -- search for a repetition
for j = i-1,1,-1 do
if passed[i] == passed[j] then
error(string.format("rule '%s' may be left recursive", val2str(passed[i])))
end
end
end
error("too many left calls in grammar")
end
--[[
** Check whether a rule can be left recursive; raise an error in that
** case; otherwise return 1 iff pattern is nullable.
** The return value is used to check sequences, where the second pattern
** is only relevant if the first is nullable.
** Parameter 'nb' works as an accumulator, to allow tail calls in
** choices. ('nb' true makes function returns true.)
** Parameter 'passed' is a list of already visited rules, 'npassed'
** counts the elements in 'passed'.
** Assume ktable at the top of the stack.
]]--
local verifyrule
verifyrule = define_tree_visitor{
[{
TTag.Char, TTag.Set, TTag.Any, TTag.False, TTag.UTFR,
TTag.Throw, -- labeled failure
}] = function(tree, g, passed, n, nb)
return nb -- cannot pass from here
end,
[{
TTag.True, TTag.Behind, -- look-behind cannot have calls
}] = function(tree, g, passed, n, nb)
return true
end,
[{ TTag.Not, TTag.And, TTag.Rep, }] = function(tree, g, passed, n, nb)
return verifyrule(tree.sib1, g, passed, n, true) -- tail call
end,
[{ TTag.Capture, TTag.RunTime, TTag.XInfo, }] = function(tree, g, passed, n, nb)
return verifyrule(tree.sib1, g, passed, n, nb) -- tail call
end,
[ TTag.Call ] = function(tree, g, passed, n, nb)
local rule = g.ruletab[tree.key] -- look up rule
return verifyrule(rule, g, passed, n, nb) -- tail call
end,
[ TTag.Seq ] = function(tree, g, passed, n, nb)
-- only check 2nd child if first is nb
if not verifyrule(tree.sib1, g, passed, n, false) then
return nb
else
-- note that we don't propagate new npassed from 1st child
return verifyrule(tree.sib2, g, passed, n, nb) -- tail call
end
end,
[ TTag.Choice ] = function(tree, g, passed, n, nb)
-- must check both children
nb = verifyrule(tree.sib1, g, passed, n, nb)
-- note that we don't propagate new npassed from 1st child
return verifyrule(tree.sib2, g, passed, n, nb) -- tail call
end,
[ TTag.Rule ] = function(tree, g, passed, n, nb)
if n >= MAXRULES then -- too many steps?
return verifyerror(g, passed, n) -- error
else
passed[n+1] = tree.key -- add rule to path
return verifyrule(tree.sib1, g, passed, n + 1, nb) -- tail call
end
end,
[ TTag.Grammar ] = function(tree, g, passed, n, nb)
return nullable(tree) -- sub-grammar cannot be left recursive
end,
}
function verifygrammar(grammar)
local passed = {}
-- check left-recursive rules
local rule = grammar.sib1
while rule.tag == TTag.Rule do
if rule.key ~= nil then -- skip unused rules
verifyrule(rule.sib1, grammar, passed, 0, false)
end
rule = rule.sib2
end
if rule.tag ~= TTag.True then
error("assertion failure")
end
-- check infinite loops inside rules
rule = grammar.sib1
while rule.tag == TTag.Rule do
if rule.key ~= nil then -- skip unused rules
if checkloops(grammar, rule.sib1) then
error("empty loop in rule '" .. val2str(rule.name) .. "'")
end
end
rule = rule.sib2
end
if rule.tag ~= TTag.True then
error("assertion failure")
end
end
--[[
** Fix a TOpenCall into a TCall node, using table 'postable' to
** translate a key to its rule address in the tree. Raises an
** error if key does not exist.
]]--
function fixonecall(g, t, postab)
local name = t.key
local rule = g.ruletab[name]
if t.tag == TTag.Call then
if rule == nil then
error(string.format("rule '%s' undefined in given grammar", val2str(name)))
end
-- unlike our upstream, we don't clone patterns when we build a grammar
-- so we can't mutate this tree w/o possibly breaking any other grammars
-- which might hold an alias of this call. So we don't distinguish
-- Call and OpenCall and we don't mutate the tag here and
-- don't link it up. However, we can mutate the Rule
-- as those are not shared
rule.key = name -- mark this as used
elseif rule ~= nil then -- TTag.Throw
-- As before, we can't mutate the tree
rule.key = name -- mark this as used
end
end
--[[
** Transform left associative constructions into right
** associative ones, for sequence and choice; that is:
** (t11 + t12) + t2 => t11 + (t12 + t2)
** (t11 * t12) * t2 => t11 * (t12 * t2)
** (that is, Op (Op t11 t12) t2 => Op t11 (Op t12 t2))
]]--
function correctassociativity (tree)
local tag = tree.tag
if tag ~= TTag.Choice and tag ~= TTag.Seq then
error("impossible")
end
local t1 = tree.sib1
while t1.tag == tree.tag do
local t11, t12 = t1.sib1, t1.sib2
local t2 = tree.sib2
-- don't mutate t1 in place as others may be keeping copies of it;
-- mutating 'tree' in place is okay as we're not changing its semantics
tree.sib1 = t11
tree.sib2 = newroot2sib(tag, t12, t2)
t1 = tree.sib1
end
return tree
end
--[[
** Make final adjustments in a tree. Fix open calls in tree 't',
** making them refer to their respective rules or raising appropriate
** errors (if not inside a grammar). Correct associativity of associative
** constructions (making them right associative). Assume that tree's
** ktable is at the top of the stack (for error messages).
]]--
local finalfix_helper = define_tree_visitor{
[TTag.Grammar] = function(t)
return t -- subgrammars were already fixed
end,
[TTag.Call] = function(t, g, postab)
if g == nil then
error("rule '" .. val2str(t.key) .. "' used outside a grammar")
else
return fixonecall(g, t, postab)
end
end,
[TTag.Throw] = function(t, g, postab)
if g ~= nil then
return fixonecall(g, t, postab)
end
end,
[{TTag.Seq, TTag.Choice}] = function(t, g, postab)
return correctassociativity(t)
end,
default = function(t) return t end,
}
function finalfix(g, t, postab)
finalfix_helper(t, g, postab)
if t.tag == TTag.Grammar then return end
local n = numsiblings[t.tag]
if n == 1 then
return finalfix(g, t.sib1, postab) -- tail call
elseif n == 2 then
finalfix(g, t.sib1, postab)
return finalfix(g, t.sib2, postab) -- tail call
elseif n == 0 then
return
else
error("strange number of siblings")
end
end
--[[
** Give a name for the initial rule if it is not referenced
]]--
function initialrulename(grammar)
if grammar.sib1.key == nil then -- initial rule is not referenced?
grammar.sib1.key = grammar.sib1.name
end
end
function newgrammar(tbl)
local postab, rpostab = collectrules(tbl)
local g = setmetatable({
tag = TTag.Grammar,
code = nil,
sib1 = nil, -- will fill this in
n = #rpostab, -- number of rules
ruletab = {}, -- map rule names to rules
}, metareg)
buildgrammar(g, tbl, postab, rpostab)
finalfix(g, g.sib1, postab)
initialrulename(g)
verifygrammar(g)
return g
end
--[[ ====================================================== ]]--
function prepcompile(p)
finalfix(nil, p, {}) -- correct associativity
return compile(p)
end
function lp_printtree(patt, c)
local tree = getpatt(patt)
if c then
finalfix(nil, tree, {}) -- correct associativity
end
print("[]") -- for compatibility, this is a fake 'ktable'
io.write(table.concat(printtree(tree, 0, {})))
end
function lp_printcode(patt)
local p = getpatt(patt)
if p.code == nil then
prepcompile(p)
end
print("[]") -- for compatibility, this is a fake 'ktable'
io.write(table.concat(printpatt(p.code, {})))
end
--[[
** Get the initial position for the match, interpreting negative
** values from the end of the subject. Result is 1-based.
]]--
function initposition(ii, len)
if ii > 0 then -- positive index?
if ii <= len then -- inside the string?
return ii -- return it (no correction to 0-base)
else
return len + 1 -- crop at the end
end
else -- negative index
if (-ii) <= len then -- inside the string?
return len + 1 - (-ii) -- return position from the end
else
return 1
end
end
end
-- Main match function
function lp_match(pattern, subject, init, ...)
local p = getpatt(pattern)
if p.code == nil then prepcompile(p) end
local code = p.code
if type(subject) ~= 'string' then error("subject is not a string") end
local i
if init == nil then
i = 1
else
i = initposition(checkint(init), #subject)
end
return match(subject, i, code, ...)
end
--[[
** ======================================================
** Library creation and functions not related to matching
** ======================================================
]]--
function lp_setmax(lim)
lim = 0 + lim -- convert to integer
if lim <= 0 then
error("out of range")
end
MAXSTACK = lim
end
local lp_type = define_type_visitor{
pattern = function() return "pattern" end,
default = function() return nil end,
}
function lp_gc(p)
p._code = nil
end
function createcat(charspec)
local t = newcharset()
for i=0,CHARMAX do -- XXX not unicode safe
local s = compat.utf8char(i)
if s:find(charspec) ~= nil then
t.set[i] = true
end
end
return t
end
function lp_locale(tbl)
if tbl == nil then
tbl = {}
end
tbl.alnum = createcat("%w")
tbl.alpha = createcat("%a")
tbl.cntrl = createcat("%c")
tbl.digit = createcat("%d")
tbl.graph = createcat("[%p%w]") -- printable except space
tbl.lower = createcat("%l")
tbl.print = createcat("%C") -- printable = "not a control character"?
tbl.punct = createcat("%p") -- "printable but not space or alnum
tbl.space = createcat("%s")
tbl.upper = createcat("%u")
tbl.xdigit = createcat("%x")
return tbl
end
--[[ lpltree.c ]]--
metareg.__mul = lp_seq
metareg.__add = lp_choice
metareg.__pow = lp_star
metareg.__gc = lp_gc
metareg.__len = lp_and
metareg.__div = lp_divcapture
metareg.__mod = lp_acccapture
metareg.__unm = lp_not
metareg.__sub = lp_sub
metareg.__tostring = printrepl
local pattreg = {
ptree = lp_printtree,
pcode = lp_printcode,
match = lp_match,
B = lp_behind,
V = lp_V,
C = lp_simplecapture,
Cc = lp_constcapture,
Cmt = lp_matchtime,
Cb = lp_backref,
Carg = lp_argcapture,
Cp = lp_poscapture,
Cs = lp_substcapture,
Ct = lp_tablecapture,
Cf = lp_foldcapture,
Cg = lp_groupcapture,
P = lp_P,
S = lp_set,
R = lp_range,
utfR = lp_utfr,
locale = lp_locale,
version = "LLPegLabel " .. VERSION,
setmaxstack = lp_setmax,
type = lp_type,
T = lp_throw, -- labeled failure throw
}
metareg.__index = pattreg
return pattreg
end)
local modules = {}
modules['bit32'] = require('bit32')
modules['string'] = require('string')
modules['strict'] = {}
modules['table'] = require('table')
local function myrequire(name)
if modules[name] == nil then
modules[name] = true
modules[name] = (builders[name])(myrequire)
end
return modules[name]
end
return myrequire('llpeg')
end)()