(subject.size.vec <- unique(as.integer(10^seq(0,3.5,l=100))))
#> [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
#> [16] 17 18 20 22 23 25 28 30 33 35 38 42 45 49 53
#> [31] 58 63 68 74 81 87 95 103 112 121 132 143 155 168 183
#> [46] 198 215 233 253 275 298 323 351 380 413 448 486 527 572 620
#> [61] 673 730 792 859 932 1011 1097 1190 1291 1401 1519 1648 1788 1940 2104
#> [76] 2283 2477 2687 2915 3162
(backtrackers <- c(
if(requireNamespace("stringi"))atime::atime_grid(
ICU=stringi::stri_match(subject, regex=pattern)),
atime::atime_grid(
PCRE=regexpr(pattern, subject, perl=TRUE),
TRE=regexpr(pattern, subject, perl=FALSE))))
#> Loading required namespace: stringi
#> $ICU
#> stringi::stri_match(subject, regex = pattern)
#>
#> $PCRE
#> regexpr(pattern, subject, perl = TRUE)
#>
#> $TRE
#> regexpr(pattern, subject, perl = FALSE)
backtrackers.result <- atime::atime(
N=subject.size.vec,
setup={
subject <- paste(rep("a", N), collapse="")
pattern <- paste(rep(c("(a)?", "\\1"), each=N), collapse="")
},
expr.list=backtrackers)
backtrackers.best <- atime::references_best(backtrackers.result)
plot(backtrackers.best)
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Warning in grid.Call.graphics(C_polygon, x$x, x$y, index): semi-transparency is
#> not supported on this device: reported only once per page
The plot above shows that ICU/PCRE/TRE are all exponential in N (subject/pattern size) when the pattern contains backreferences.
all.exprs <- c(
if(requireNamespace("re2"))atime::atime_grid(
RE2=re2::re2_match(subject, pattern)),
backtrackers)
#> Loading required namespace: re2
all.result <- atime::atime(
N=subject.size.vec,
setup={
subject <- paste(rep("a", N), collapse="")
pattern <- paste(rep(c("a?", "a"), each=N), collapse="")
},
expr.list=all.exprs)
all.best <- atime::references_best(all.result)
plot(all.best)
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Warning in grid.Call.graphics(C_polygon, x$x, x$y, index): semi-transparency is
#> not supported on this device: reported only once per page
The plot above shows that ICU/PCRE are exponential time whereas
RE2/TRE are polynomial time. Exercise for the reader: modify the above
code to use the seconds.limit
argument so that you can see what
happens to ICU/PCRE for larger N (hint: you should see a difference at
larger sizes).
atime_grid
to compare different enginesIn the nc
package there is an engine
argument which controls which
C regex library is used:
nc.exprs <- atime::atime_grid(
list(ENGINE=c(
##if(requireNamespace("re2"))"RE2",#uncomment when new nc on CRAN.
"PCRE",
if(requireNamespace("stringi"))"ICU")),
nc=nc::capture_first_vec(subject, pattern, engine=ENGINE))
nc.result <- atime::atime(
N=subject.size.vec,
setup={
rep.collapse <- function(chr)paste(rep(chr, N), collapse="")
subject <- rep.collapse("a")
pattern <- list(maybe=rep.collapse("a?"), rep.collapse("a"))
},
expr.list=nc.exprs)
nc.best <- atime::references_best(nc.result)
plot(nc.best)
#> Warning in grid.Call.graphics(C_polygon, x$x, x$y, index): semi-transparency is
#> not supported on this device: reported only once per page
The result/plot above is consistent with the previous result.