Regular expression examples

Regular expression examples

(subject.size.vec <- unique(as.integer(10^seq(0,3.5,l=100))))
#>  [1]    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15
#> [16]   17   18   20   22   23   25   28   30   33   35   38   42   45   49   53
#> [31]   58   63   68   74   81   87   95  103  112  121  132  143  155  168  183
#> [46]  198  215  233  253  275  298  323  351  380  413  448  486  527  572  620
#> [61]  673  730  792  859  932 1011 1097 1190 1291 1401 1519 1648 1788 1940 2104
#> [76] 2283 2477 2687 2915 3162
(backtrackers <- c(
  if(requireNamespace("stringi"))atime::atime_grid(
    ICU=stringi::stri_match(subject, regex=pattern)),
  atime::atime_grid(
    PCRE=regexpr(pattern, subject, perl=TRUE),
    TRE=regexpr(pattern, subject, perl=FALSE))))
#> Loading required namespace: stringi
#> $ICU
#> stringi::stri_match(subject, regex = pattern)
#> 
#> $PCRE
#> regexpr(pattern, subject, perl = TRUE)
#> 
#> $TRE
#> regexpr(pattern, subject, perl = FALSE)
backtrackers.result <- atime::atime(
  N=subject.size.vec,
  setup={
    subject <- paste(rep("a", N), collapse="")
    pattern <- paste(rep(c("(a)?", "\\1"), each=N), collapse="")
  },
  expr.list=backtrackers)
backtrackers.best <- atime::references_best(backtrackers.result)
plot(backtrackers.best)
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Warning in grid.Call.graphics(C_polygon, x$x, x$y, index): semi-transparency is
#> not supported on this device: reported only once per page

plot of chunk unnamed-chunk-1

The plot above shows that ICU/PCRE/TRE are all exponential in N (subject/pattern size) when the pattern contains backreferences.

all.exprs <- c(
  if(requireNamespace("re2"))atime::atime_grid(
    RE2=re2::re2_match(subject, pattern)),
  backtrackers)
#> Loading required namespace: re2
all.result <- atime::atime(
  N=subject.size.vec,
  setup={
    subject <- paste(rep("a", N), collapse="")
    pattern <- paste(rep(c("a?", "a"), each=N), collapse="")
  },
  expr.list=all.exprs)
all.best <- atime::references_best(all.result)
plot(all.best)
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Warning in grid.Call.graphics(C_polygon, x$x, x$y, index): semi-transparency is
#> not supported on this device: reported only once per page

plot of chunk unnamed-chunk-2

The plot above shows that ICU/PCRE are exponential time whereas RE2/TRE are polynomial time. Exercise for the reader: modify the above code to use the seconds.limit argument so that you can see what happens to ICU/PCRE for larger N (hint: you should see a difference at larger sizes).

atime_grid to compare different engines

In the nc package there is an engine argument which controls which C regex library is used:

nc.exprs <- atime::atime_grid(
  list(ENGINE=c(
    ##if(requireNamespace("re2"))"RE2",#uncomment when new nc on CRAN.
    "PCRE",
    if(requireNamespace("stringi"))"ICU")),
  nc=nc::capture_first_vec(subject, pattern, engine=ENGINE))
nc.result <- atime::atime(
  N=subject.size.vec,
  setup={
    rep.collapse <- function(chr)paste(rep(chr, N), collapse="")
    subject <- rep.collapse("a")
    pattern <- list(maybe=rep.collapse("a?"), rep.collapse("a"))
  },
  expr.list=nc.exprs)
nc.best <- atime::references_best(nc.result)
plot(nc.best)
#> Warning in grid.Call.graphics(C_polygon, x$x, x$y, index): semi-transparency is
#> not supported on this device: reported only once per page

plot of chunk unnamed-chunk-3

The result/plot above is consistent with the previous result.