diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index e154fe1..0000000 --- a/.travis.yml +++ /dev/null @@ -1,37 +0,0 @@ - -# travis config - -sudo: required - -language: r -cache: packages - -addons: - apt: - packages: - - libxml2-dev - -r: - - release - - devel - -before_install: - - R -e "install.packages(c('devtools','roxygen2','testthat'))" - - R -e "devtools::install_deps('./pkg')" - - ./document.sh - - cd ./pkg - -r_packages: - - covr - - rmarkdown - - -after_success: - - Rscript -e 'library(covr);coveralls()' - -notifications: - email: - on_success: change - on_failure: change - - diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..3aaefa5 --- /dev/null +++ b/Makefile @@ -0,0 +1,39 @@ + +doc: + R -s -e "pkgload::load_all('pkg');roxygen2::roxygenize('pkg')" + +pkg: doc + R CMD build pkg + +install: pkg + R CMD INSTALL *.tar.gz + +check: doc + R CMD build pkg + R CMD check *.tar.gz + +cran: doc + R CMD build pkg + R CMD check --as-cran *.tar.gz + +test: doc + R -s -e "tinytest::build_install_test('pkg')" + +manual: doc + R CMD Rd2pdf --force -o manual.pdf ./pkg + +revdep: pkg + rm -rf revdep + mkdir revdep + mv *.tar.gz revdep + R -s -e "out <- tools::check_packages_in_dir('revdep',reverse=list(which='most'),Ncpus=3); print(summary(out)); saveRDS(out, file='revdep/output.RDS')" + +covr: + R -e 'covr::package_coverage("./pkg")' + +clean: + rm -rf stringdist.Rcheck + rm -rf revdep + rm -f *.tar.gz + + diff --git a/README.md b/README.md index ab37f4c..cbf7e3b 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,39 @@ -[![Build Status](https://travis-ci.org/markvanderloo/stringdist.svg?branch=master)](https://travis-ci.org/markvanderloo/stringdist) -[![Coverage Status](https://coveralls.io/repos/markvanderloo/stringdist/badge.svg)](https://coveralls.io/r/markvanderloo/stringdist) [![CRAN](http://www.r-pkg.org/badges/version/stringdist)](http://cran.r-project.org/web/packages/stringdist/NEWS) +[![status](https://tinyverse.netlify.com/badge/stringdist)](https://CRAN.R-project.org/package=stringdist) [![Downloads](http://cranlogs.r-pkg.org/badges/stringdist)](http://cran.r-project.org/package=stringdist/)[![Research software impact](http://depsy.org/api/package/cran/stringdist/badge.svg)](http://depsy.org/package/r/stringdist)[![Mentioned in Awesome Official Statistics ](https://awesome.re/mentioned-badge.svg)](http://www.awesomeofficialstatistics.org) + + ## stringdist -* Approximate matching and string distance calculations for R. +* Approximate matching, fuzzy text search, and string distance calculations for R. * All distance and matching operations are system- and encoding-independent. +* Built for speed, using [openMP](https://www.openmp.org/) for parallel computing. + + +## Citing + +Please cite the [R-Journal article](https://journal.r-project.org/archive/2014/RJ-2014-011/index.html) + +``` +@article{RJ-2014-011, + author = {Mark P.J. van der Loo}, + title = {{The stringdist Package for Approximate String Matching}}, + year = {2014}, + journal = {{The R Journal}}, + doi = {10.32614/RJ-2014-011}, + url = {https://doi.org/10.32614/RJ-2014-011}, + pages = {111--122}, + volume = {6}, + number = {1} +} +``` + +## Functionality The package offers the following main functions: @@ -19,6 +42,7 @@ The package offers the following main functions: * `stringsim` computes a string similarity between 0 and 1, based on `stringdist` * `amatch` is a fuzzy matching equivalent of R's native `match` function * `ain` is a fuzzy matching equivalent of R's native `%in%` operator +* `afind` finds the location of fuzzy matches of a short string in a long string. * `seq_dist`, `seq_distmatrix`, `seq_amatch` and `seq_ain` for distances between, and matching of integer sequences. (see also the [hashr](https://github.com/markvanderloo/hashr) package). These functions are built upon `C`-code that re-implements some common (weighted) string @@ -42,7 +66,7 @@ Also, there are some utility functions: * `phonetic()` computes phonetic codes of strings (currently only soundex) * `printable_ascii()` is a utility function that detects non-printable ascii or non-ascii characters. -#### C api +#### C API As of version `0.9.5.0` you can call a number of `stringdist` functions directly from the `C` code of your R package. The description of the API can be found @@ -55,6 +79,8 @@ from the `C` code of your R package. The description of the API can be found system.file("doc/stringdist_api.pdf", package="stringdist") ``` +Examples of packages that link to `stringdist` can be found [here](https://github.com/markvanderloo/linkstringdist) and +[here](https://github.com/ChrisMuir/refinr). @@ -86,6 +112,15 @@ of the code is written in `C`, the development version may crash your `R`-sessio * A [paper](http://journal.r-project.org/archive/2014-1/loo.pdf) on stringdist has been published in the R-journal * [Slides](http://www.slideshare.net/MarkVanDerLoo/stringdist-use-r2014) of te _useR!2014_ conference. +#### Note to users: deprecated arguments removed as of version 0.9.5.0 + +The following arguments have been obsolete since 2015 and have been removed in the 0.9.5.0 release (spring 2018) + +* Argument `cluster` for function `stringdistmatrix`. +* Argument `maxDist` for functions `stringdist` and `stringdistmatrix` (not `amatch`). +* Argument `ncores` for function `stringdistmatrix` + + #### Note to users: deprecated arguments as of >= 0.9.0, >= 0.9.2 Parallelization used to be based on R's ```parallel``` package, that works by spawning several R sessions in the background. As of version 0.9.0, ```stringdist``` uses the more efficient ```openMP``` protocol to parallelize everything under the hood. diff --git a/build.sh b/build.sh deleted file mode 100755 index b9db284..0000000 --- a/build.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -R=R -CHECKARG="" -while [ $# -gt 0 ] ; do - case "$1" in - -dev) - R=Rdev - shift 1 ;; - *) - CHECKARG="$CHECKARG $1" - shift 1 ;; - esac -done - -echo "######## Removing building information..." -rm -rf output - - -echo "######## Generate documentation..." -./document.sh - - -echo "######## Building package in output..." -mkdir output -cd output -$R CMD build ../pkg -echo "######## Testing package with $CHECKARG ..." -for x in *.tar.gz -do - $R CMD check $CHECKARG $x -done - -echo "**BUILT USING $R" -$R --version - diff --git a/compile.sh b/compile.sh index 02a2af7..bf18713 100755 --- a/compile.sh +++ b/compile.sh @@ -2,7 +2,7 @@ cd pkg/src rm --verbose *.o *.so -gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -fpic -fopenmp -O2 -Wall -pipe -g -c *.c +gcc-10 -std=gnu99 -I/usr/share/R/include -DNDEBUG -fpic -fopenmp -O4 -Wall -pipe -g -c -Wstrict-prototypes -Wformat *.c #gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -fpic -O2 -Wall -pipe -g -c *.c gcc -std=gnu99 -shared -o stringdist.so *.o -L/usr/lib/R/lib -lR cd ../../ diff --git a/document.sh b/document.sh deleted file mode 100755 index d87afee..0000000 --- a/document.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# document R code -R -f roxygen.R -R CMD Rd2pdf --force --no-preview -o manual.pdf ./pkg - -# document the C API -if ! [ -x "$(command -v doxygen)" ]; then - echo 'Warning: Doxygen is not installed. Exiting' >&2 - exit 0 -fi - - -basedir=`pwd` -cd pkg/inst/include -doxygen Doxyfile -cd $basedir -cd pkg/inst/doc/latex -make -cd .. -mv latex/refman.pdf ./stringdist_api.pdf -rm -rf latex -cd $basedir - - diff --git a/examples/seq_sim.R b/examples/seq_sim.R index 4be8b49..46924af 100644 --- a/examples/seq_sim.R +++ b/examples/seq_sim.R @@ -1,6 +1,3 @@ -L1 <- list(1:3,2:4) -L2 <- list(1:3) -seq_sim(L1,L2,method="osa") # note how missing values are handled (L2 is recycled over L1) L1 <- list(c(1L,NA_integer_,3L),2:4,NA_integer_) diff --git a/pkg/DESCRIPTION b/pkg/DESCRIPTION index 3bf9968..61ca970 100644 --- a/pkg/DESCRIPTION +++ b/pkg/DESCRIPTION @@ -1,29 +1,35 @@ Package: stringdist Maintainer: Mark van der Loo License: GPL-3 -Title: Approximate String Matching and String Distance Functions -LazyData: no +Title: Approximate String Matching, Fuzzy Text Search, and String Distance Functions Type: Package LazyLoad: yes -Authors@R: c( person("Mark", "van der Loo", role=c("aut","cre"),email="mark.vanderloo@gmail.com") +Authors@R: c( person("Mark", "van der Loo", role=c("aut","cre") + , email="mark.vanderloo@gmail.com" + , comment= c(ORCID="0000-0002-9807-4686")) , person("Jan", "van der Laan", role="ctb") , person("R Core Team","" , role="ctb") , person("Nick","Logan" , role="ctb") - , person("Chris","Muir" , role="ctb")) + , person("Chris","Muir" , role="ctb") + , person("Johannes", "Gruber" , role="ctb") + , person("Brian","Ripley" , role="ctb")) Description: Implements an approximate string matching version of R's native - 'match' function. Can calculate various string distances based on edits + 'match' function. Also offers fuzzy text search based on various string + distance measures. Can calculate various string distances based on edits (Damerau-Levenshtein, Hamming, Levenshtein, optimal sting alignment), qgrams (q- gram, cosine, jaccard distance) or heuristic metrics (Jaro, Jaro-Winkler). An implementation of soundex is provided as well. Distances can be computed between character vectors while taking proper care of encoding or between integer - vectors representing generic sequences. An API for C or C++ is exposed as well. -Version: 0.9.5.0 + vectors representing generic sequences. This package is built for speed and + runs in parallel by using 'openMP'. An API for C or C++ is exposed as well. + Reference: MPJ van der Loo (2014) . +Version: 0.9.16 Depends: R (>= 2.15.3) -Imports: - parallel URL: https://github.com/markvanderloo/stringdist BugReports: https://github.com/markvanderloo/stringdist/issues Suggests: - testthat -RoxygenNote: 6.0.1 + tinytest +Imports: parallel +Encoding: UTF-8 +RoxygenNote: 7.3.2 diff --git a/pkg/NAMESPACE b/pkg/NAMESPACE index 3dd465d..53ebaad 100644 --- a/pkg/NAMESPACE +++ b/pkg/NAMESPACE @@ -1,7 +1,11 @@ # Generated by roxygen2: do not edit by hand +export(afind) export(ain) export(amatch) +export(extract) +export(grab) +export(grabl) export(phonetic) export(printable_ascii) export(qgrams) @@ -14,5 +18,6 @@ export(seq_sim) export(stringdist) export(stringdistmatrix) export(stringsim) +export(stringsimmatrix) importFrom(parallel,detectCores) useDynLib(stringdist, .registration=TRUE) diff --git a/pkg/NEWS b/pkg/NEWS index ed7ea3d..7d829cf 100644 --- a/pkg/NEWS +++ b/pkg/NEWS @@ -1,7 +1,105 @@ -version 0.9.4.7 +version 0.9.16 +- Fixed broken links in documentation +- Removed example causing spurious ASAN error on some systems. + +version 0.9.15 +- Fixe issue with zero-length 'nthreads' argument in all exported functions + with this parameter. (Thanks to Brian Ripley for the notification and pointer + to the problem) + +version 0.9.14 +- Fixed issue with zero-length strings in 'qgrams' (Thanks to Brian Ripley + for the notification and pointer to the origin of the problem) + +version 0.9.12 +- apparently R_xlen_t is long long int on CLANG/Windows and long int on gcc-13/debian + +version 0.9.11 +- Fixed a warning in gcc-13: changed specifier from %d to %ld. + (Thanks to Kurt Hornik for the head's up) + +version 0.9.10 +- Fixed another warning generated by new C compiler that I overlooked. + (Thanks to the CRAN team for the head's up) + +version 0.9.9 +- Fixed warnings generated by new C compiler. (function prototypes must + now be defined completely). (Thanks to Kurt Hornik for the head's up.) + +version 0.9.8 +- Fixed some issues on C-level causing problems with the + CLANG compiler. (Thanks to Brian Ripley for not only + reporting this, but also sending updated code with + fixes). + + +version 0.9.7 +- Fixes in use of INTEGER() and VECTOR_ELT() after updates in R's C API. + this affected 'afind' and 'max_length' (internally). (Thanks to Luke + Tierny and Kurt Hornik for the notification). +- Fix in 'amatch' causing utf-8 characters to be ignored in some + cases (thanks to Joan Mime for reporting #78). +- Fix: segfault when 'afind' was called with many search patterns or many + texts to be searched. +- Fix: stringsimmatrix was not normalized correctly (Thanks to Tamas Ferenci + for reporting GH). + + +version 0.9.6.3 +- Resubmit. Fixed an URL redirect that was detected by CRAN. + +version 0.9.6.2 +- Resubmit. Fixed url issues detected by CRAN, added doi to description + as per CRAN request. + +version 0.9.6.1 +- Bugfix: afind/grab/grabl returned wrong results on MacOS only. + (thanks to Prof. Brian Ripley for the notification and for running tests + on his personal machine and to Tomas Kalibera for making the + ubuntu-rchk docker image available). + +version 0.9.6 +- New function 'afind': find approximate matches in text based on string distance. +- New functions 'grab', 'grabl': fuzzy matching equivalent to 'grep' and 'grepl'. +- New function 'extract': fuzzy matching equivalent of stringr::str_extract. +- New algorithm 'running_cosine': fast fuzzy text search using cosine distance. +- New function 'stringsimmatrix' (Thanks to Johannes Gruber). +- Number of threads used is now reported when loading 'stringdist'. +- Internal fixes (in some cases class() == 'class' was used). + +version 0.9.5.5 +- Changed two URLs to canonical form in README.md (https://) to comply with + CRAN policy. + +version 0.9.5.4 +- Some tests using seq_dist() would fail unpredictably when the input was + defined with lazily evaluated arguments, e.g. list(1:3, 2:4); but only in the + context of NSE by a test suite ('tinytest', 'testthat'). Tests were replaced by + literal versions, e.g. list(c(1,2,3), c(2,3,4)). + +version 0.9.5.3 +- Update in test suite to stay on CRAN + +version 0.9.5.2 +- RJournal paper and C/C++ api docs are now presented as vignette. +- Switched to tinytest framework +- Fix: stringdist could cause a segfault for edit distances between very long + strings. (Thanks to GH user gllipatz) + + +version 0.9.5.1 +- Fixed header file for C API + +version 0.9.5.0 +- New contributor: Chris Muir +- C/C++ API now exposed for packages LinkingTo stringdist. See `?stringdist_api` +- Arguments 'maxDist', 'ncores', 'cluster' of functions 'stringdist' and + 'stringdistmatrix' have been deprecated for several years and are now + removed. - Fixed edge case where cosine distance with q=1, between strings of repeating characters yielded Inf (Thanks to Markus Dumke) + version 0.9.4.6 - Fixed argument passing error in lower_tri (thanks to Kurt Hornik) diff --git a/pkg/R/afind.R b/pkg/R/afind.R new file mode 100644 index 0000000..56ebc92 --- /dev/null +++ b/pkg/R/afind.R @@ -0,0 +1,192 @@ +#' Stringdist-based fuzzy text search +#' +#' \code{afind} slides a window of fixed width over a string \code{x} and +#' computes the distance between the each window and the sought-after +#' \code{pattern}. The location, content, and distance corresponding to the +#' window with the best match is returned. +#' +#' +#' @param x strings to search in +#' @param pattern strings to find (not a regular expression). For \code{grab}, +#' \code{grabl}, and \code{extract} this must be a single string. +#' @param window width of moving window. +#' @param value toggle return matrix with matched strings. +#' @inheritParams amatch +#' +#' @details +#' Matching is case-sensitive. Both \code{x} and \code{pattern} are converted +#' to \code{UTF-8} prior to search, unless \code{useBytes=TRUE}, in which case +#' the distances are measured bytewise. +#' +#' Code is parallelized over the \code{x} variable: each value of \code{x} +#' is scanned for every element in \code{pattern} using a separate thread (when \code{nthread} +#' is larger than 1). +#' +#' The functions \code{grab} and \code{grabl} are approximate string matching +#' functions that somewhat resemble base R's \code{\link[base]{grep}} and +#' \code{\link[base:grep]{grepl}}. They are implemented as convenience wrappers +#' of \code{afind}. +#' +#' @section Running cosine distance: +#' This algorithm gains efficiency by using that two consecutive windows have +#' a large overlap in their q-gram profiles. It gives the same result as +#' the \code{"cosine"} distance, but much faster. +#' +#' +#' @return +#' For \code{afind}: a \code{list} of three matrices, each with +#' \code{length(x)} rows and \code{length(pattern)} columns. In each matrix, +#' element \eqn{(i,j)} corresponds to \code{x[i]} and \code{pattern[j]}. The +#' names and description of each matrix is as follows. +#' \itemize{ +#' \item{\code{location}. \code{[integer]}, location of the start of best matching window. +#' When \code{useBytes=FALSE}, this corresponds to the location of a \code{UTF} code point +#' in \code{x}, possibly after conversion from its original encoding.} +#' \item{\code{distance}. \code{[character]}, the string distance between pattern and +#' the best matching window.} +#' \item{\code{match}. \code{[character]}, the first, best matching window.} +#' +#' } +#' +#' @family matching +#' +#' @examples +#' texts = c("When I grow up, I want to be" +#' , "one of the harvesters of the sea" +#' , "I think before my days are gone" +#' , "I want to be a fisherman") +#' patterns = c("fish", "gone","to be") +#' +#' afind(texts, patterns, method="running_cosine", q=3) +#' +#' grabl(texts,"grew", maxDist=1) +#' extract(texts, "harvested", maxDist=3) +#' +#' +#' @export +afind <- function(x, pattern, window=NULL + , value=TRUE + , method = c("osa","lv","dl","hamming","lcs", "qgram","cosine","running_cosine","jaccard","jw","soundex") + , useBytes = FALSE + , weight=c(d=1,i=1,s=1,t=1) + , q = 1 + , p = 0 + , bt = 0 + , nthread = getOption("sd_num_thread") + ){ + + stopifnot( + all(is.finite(weight)) + , all(weight > 0) + , all(weight <=1) + , is.null(window) || window >= 1 + , q >= 0 + , p <= 0.25 + , p >= 0 + , is.logical(useBytes) && !is.na(useBytes) + , is.logical(value) && !is.na(value) + , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE) + , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE) + , length(nthread) == 1 + , is.numeric(nthread) + , nthread > 0 + ) + + x <- as.character(x) + pattern <- as.character(pattern) + if ( !useBytes ){ + x <- enc2utf8(x) + pattern <- enc2utf8(pattern) + } + + if (is.null(window)){ + window = nchar(pattern, type = if (useBytes) "bytes" else "char") + } + + if (length(x) == 0) return(numeric(0)) + + method <- match.arg(method) + if (method == 'jw') weight <- weight[c(2,1,3)] + + + method <- METHODS[method] + if ( is.na(method) ){ + stop(sprintf("method '%s' is not defined",method)) + } + + L <- .Call("R_afind" + , x + , pattern + , as.integer(window) + , method + , as.double(weight) + , as.double(p) + , as.double(bt) + , as.integer(q) + , as.integer(useBytes) + , as.integer(nthread) + , PACKAGE="stringdist") + + names(L) <- c("location", "distance") + + if (isTRUE(value)){ + matches = sapply(seq_along(pattern), function(i){ + substr(x, L[[1]][,i], L[[1]][,i] + window[i]-1) + }) + L$match <- matrix(matches, nrow=length(x)) + } + + L +} + + + + +#' @rdname afind +#' @param ... passed to \code{afind}. +#' @param maxDist Only windows with distance \code{<= maxDist} are considered a match. +#' @return +#' For \code{grab}, an \code{integer} vector, indicating in which elements of +#' \code{x} a match was found with a distance \code{<= maxDist}. The matched +#' values when \code{value=TRUE} (equivalent to \code{\link[base]{grep}}). +#' @export +grab <- function(x, pattern, maxDist=Inf, value=FALSE, ...){ + stopifnot(is.numeric(maxDist), maxDist >= 0, length(pattern) == 1) + L <- afind(x, pattern, value=value, ...) + if (!value){ + which(L$distance <= maxDist) + } else { + L$match[L$distance <= maxDist ] + } +} + +#' @rdname afind +#' @param ... passed to \code{afind}. +#' @return +#' For \code{grabl}, a \code{logical} vector, indicating in which elements of +#' \code{x} a match was found with a distance \code{<= maxDist}. (equivalent +#' to \code{\link[base:grep]{grepl}}). +#' @export +grabl <- function(x, pattern, maxDist=Inf, ...){ + stopifnot(is.numeric(maxDist), maxDist >= 0, length(pattern) == 1) + L <- afind(x, pattern, value=FALSE, ...) + as.logical(L$distance <= maxDist) +} + + +#' @rdname afind +#' +#' @return +#' For \code{extract}, a \code{character} matrix with \code{length(x)} rows and +#' \code{length(pattern)} columns. If match was found, element \eqn{(i,j)} +#' contains the match, otherwise it is set to \code{NA}. +#' @export +extract <- function(x, pattern, maxDist = Inf, ...){ + stopifnot(is.numeric(maxDist), maxDist >= 0, length(pattern) == 1) + L <- afind(x, pattern, value=TRUE, ...) + out <- L$match + out[L$distance > maxDist] <- NA_character_ + out +} + + diff --git a/pkg/R/amatch.R b/pkg/R/amatch.R index 4b1d705..89248d2 100644 --- a/pkg/R/amatch.R +++ b/pkg/R/amatch.R @@ -60,9 +60,9 @@ #' #' @param q q-gram size, only when method is \code{'qgram'}, \code{'jaccard'}, #' or \code{'cosine'}. -#' @param p Winklers penalty parameter for Jaro-Winkler distance, with +#' @param p Winklers 'prefix' parameter for Jaro-Winkler distance, with #' \eqn{0\leq p\leq0.25}. Only when method is \code{'jw'} -#' @param bt Winkler's boost threshold. Winkler's penalty factor is +#' @param bt Winkler's boost threshold. Winkler's prefix factor is #' only applied when the Jaro distance is larger than \code{bt}. #' Applies only to \code{method='jw'} and \code{p>0}. #' @@ -72,6 +72,8 @@ #' \code{logical} vector of length \code{length(x)} indicating wether an #' element of \code{x} approximately matches an element in \code{table}. #' +#' @family matching +#' #' @example ../examples/amatch.R #' @export amatch <- function(x, table, nomatch=NA_integer_, matchNA=TRUE @@ -102,6 +104,8 @@ amatch <- function(x, table, nomatch=NA_integer_, matchNA=TRUE , is.logical(useBytes) , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE) , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE) + , length(nthread) == 1 + , is.numeric(nthread) , nthread > 0 ) if (method == 'jw') weight <- weight[c(2,1,3)] @@ -115,6 +119,7 @@ amatch <- function(x, table, nomatch=NA_integer_, matchNA=TRUE , as.double(weight), as.double(p), as.double(bt) , as.integer(q) , as.double(maxDist), as.integer(useBytes) , as.integer(nthread) + , PACKAGE="stringdist" ) } @@ -174,9 +179,9 @@ ain <- function(x,table,...){ #' #' @param q q-gram size, only when method is \code{'qgram'}, \code{'jaccard'}, #' or \code{'cosine'}. -#' @param p Winklers penalty parameter for Jaro-Winkler distance, with +#' @param p Winkler's prefix parameter for Jaro-Winkler distance, with #' \eqn{0\leq p\leq0.25}. Only when method is \code{'jw'} -#' @param bt Winkler's boost threshold. Winkler's penalty factor is +#' @param bt Winkler's boost threshold. Winkler's prefix factor is #' only applied when the Jaro distance is larger than \code{bt}. #' Applies only to \code{method='jw'} and \code{p>0}. #' @return \code{seq_amatch} returns the position of the closest match of \code{x} @@ -209,6 +214,8 @@ seq_amatch <- function(x, table, nomatch=NA_integer_, matchNA=TRUE , matchNA %in% c(TRUE,FALSE) , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE) , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE) + , length(nthread) == 1 + , is.numeric(nthread) , nthread > 0 ) if (method == 'jw') weight <- weight[c(2,1,3)] @@ -222,6 +229,7 @@ seq_amatch <- function(x, table, nomatch=NA_integer_, matchNA=TRUE , as.double(weight), as.double(p), as.double(bt) , as.integer(q) , as.double(maxDist), 0L , as.integer(nthread) + , PACKAGE="stringdist" ) } diff --git a/pkg/R/doc_metrics.R b/pkg/R/doc_metrics.R index c91ab99..6e53885 100644 --- a/pkg/R/doc_metrics.R +++ b/pkg/R/doc_metrics.R @@ -39,14 +39,14 @@ #' \code{qgram} \tab \eqn{q}-gram distance. \cr #' \code{cosine} \tab cosine distance between \eqn{q}-gram profiles \cr #' \code{jaccard} \tab Jaccard distance between \eqn{q}-gram profiles \cr -#' \code{jw} \tab Jaro, or Jaro-Winker distance.\cr +#' \code{jw} \tab Jaro, or Jaro-Winkler distance.\cr #' \code{soundex} \tab Distance based on soundex encoding (see below) #' } #' #' #' @section A short description of string metrics supported by \pkg{stringdist}: #' -#' See \href{https://journal.r-project.org/archive/2014-1/loo.pdf}{Van der Loo +#' See \href{https://journal.r-project.org/articles/RJ-2014-011/}{Van der Loo #' (2014)} for an extensive description and references. The review papers of #' Navarro (2001) and Boytsov (2011) provide excellent technical overviews of #' respectively online and offline string matching algorithms. @@ -81,9 +81,9 @@ #' The computation is aborted when \code{q} is is larger than the length of #' any of the strings. In that case \code{Inf} is returned. #' -#' The \bold{cosine distance} (method='cosine') is computed as \eqn{1-x\cdot y/(\|x\|\|y\|)}, where \eqn{x} and -#' \eqn{y} were defined above. -#' +#' The \bold{cosine distance} (method='cosine') is computed as \eqn{1-x\cdot +#' y/(\|x\|\|y\|)}, where \eqn{x} and \eqn{y} were defined above. +#' #' Let \eqn{X} be the set of unique \eqn{q}-grams in \code{a} and \eqn{Y} the set of unique #' \eqn{q}-grams in \code{b}. The \bold{Jaccard distance} (\code{method='jaccard'}) is given by \eqn{1-|X\cap Y|/|X\cup Y|}. #' @@ -108,7 +108,7 @@ #' \eqn{d} is the Jaro-distance. Here, \eqn{l} is obtained by counting, from #' the start of the input strings, after how many characters the first #' character mismatch between the two strings occurs, with a maximum of four. The -#' factor \eqn{p} is a penalty factor, which in the work of Winkler is often +#' factor \eqn{p} is a 'prefix' factor, which in the work of Winkler is often #' chosen \eqn{0.1}. #' #' For the \bold{soundex} distance (method='soundex'), strings are translated to a soundex code @@ -118,6 +118,16 @@ #' in the ranges a-z and A-Z. A warning is emitted when non-printable or non-ascii #' characters are encountered. Also see \code{\link{printable_ascii}}. #' +#' The \bold{running_cosine} distance is an implementatation of the cosine +#' distance especially meant for fuzzy text search as in \code{\link{afind}}. +#' In fuzzy search a window of \code{n} characters slides across a (long) +#' string while for each position of the window the distance between the part +#' of the string in the window and a search pattern is computed. The (position +#' of) the window with the shortest distance to the search pattern is returned. +#' Sliding the window with a single position only affects the \eqn{q}-grams at +#' the beginning and end of the window, and the 'running cosine' distance uses +#' this and a few other tricks to save calculations. +#' #' #' #' @references diff --git a/pkg/R/doc_parallel.R b/pkg/R/doc_parallel.R index 4911763..af7bcc0 100644 --- a/pkg/R/doc_parallel.R +++ b/pkg/R/doc_parallel.R @@ -4,7 +4,8 @@ #' #' @description This page describes how \pkg{stringdist} uses parallel processing. #' -#' @section Multithreading and parallelization in \pkg{stringdist}: The core +#' @section Multithreading and parallelization in \pkg{stringdist}: +#' The core #' functions of \pkg{stringdist} are implemented in C. On systems where #' \code{openMP} is available, \pkg{stringdist} will automatically take #' advantage of multiple cores. The @@ -12,7 +13,7 @@ #' on OpenMP} of the #' \href{https://cran.r-project.org/doc/manuals/r-release/R-exts.html}{Writing #' R Extensions} manual discusses on what systems OpenMP is available (at the time of writing more or -#' less, anywhere except on OSX). +#' less anywhere except on OSX). #' #' By default, the number of threads to use is taken from \code{options('sd_num_thread')}. #' When the package is loaded, the value for this option is determined as follows: diff --git a/pkg/R/phonetic.R b/pkg/R/phonetic.R index 255ef1b..bbd7e54 100644 --- a/pkg/R/phonetic.R +++ b/pkg/R/phonetic.R @@ -17,7 +17,7 @@ #' not be trusted. If non-ascii or non-printable ascii charcters are encountered, a warning #' is emitted. #' -#' @seealso \code{\link{printable_ascii}}, \code{\link{stringdist-package}} +#' @seealso \code{\link{printable_ascii}} #' #' #' @return @@ -28,7 +28,7 @@ #' @references #' \itemize{ #' \item{The Soundex algorithm implemented is the algorithm used by the -#' \href{http://www.archives.gov/research/census/soundex.html}{National Archives}. +#' \href{https://www.archives.gov/research/census/soundex}{National Archives}. #' This algorithm differs slightly from the original algorithm patented by R.C. Russell #' (US patents 1261167 (1918) and 1435663 (1922)). #' } @@ -43,7 +43,7 @@ phonetic <- function(x, method = c("soundex"), useBytes = FALSE) { stopifnot(is.logical(useBytes)) if (!useBytes) x <- enc2utf8(x) if (method == "soundex") { - r <- .Call("R_soundex", x, useBytes) + r <- .Call("R_soundex", x, useBytes,PACKAGE="stringdist") if (!useBytes) int2char(r) else r } } diff --git a/pkg/R/qgrams.R b/pkg/R/qgrams.R index af63989..87d09ef 100644 --- a/pkg/R/qgrams.R +++ b/pkg/R/qgrams.R @@ -23,15 +23,24 @@ #' @example ../examples/qgrams.R #' @export qgrams <- function(..., .list=NULL,q=1L,useBytes=FALSE, useNames=!useBytes){ + stopifnot(is.numeric(q), length(q)==1, !is.na(q), q>=0) q <- as.integer(q) if (!is.null(.list) && length(.list) == 0) .list=NULL L <- lapply(c(list(...),.list), as.character) if (length(L) == 0) return(array(dim=c(0,0))) L <- setnames(L) + + if (q==0){ + return( matrix( sapply(L,function(x) sum(x=="")) + , ncol=1 + , dimnames = list(names(L), NULL)) ) + } + + L <- lapply(L,char2int) - v <- .Call("R_get_qgrams",L,as.integer(q)) + v <- .Call("R_get_qgrams",L,as.integer(q),PACKAGE="stringdist") nqgrams <- length(v)/length(L) qgrams <- NULL @@ -94,7 +103,7 @@ seq_qgrams <- function(...,.list=NULL,q=1L){ L <- lapply(c(list(...),.list),function(x) list(as.integer(x))) if (length(L) == 0) return(array(dim=c(0,0))) L <- setnames(L) - v <- .Call("R_get_qgrams",L,as.integer(q)) + v <- .Call("R_get_qgrams",L,as.integer(q), PACKAGE="stringdist") Q <- attr(v,"qgrams") nqgrams <- length(v)/length(L) Q <- t(array(Q,dim=c(q,nqgrams),dimnames=list(paste0("q",1:q),NULL))) diff --git a/pkg/R/seqdist.R b/pkg/R/seqdist.R index 1764849..0f5a2ab 100644 --- a/pkg/R/seqdist.R +++ b/pkg/R/seqdist.R @@ -24,10 +24,10 @@ #' \code{'Jaccard'}, or \code{'lcs'} #' @param q Size of the \eqn{q}-gram; must be nonnegative. Only applies to #' \code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}. -#' @param p Penalty factor for Jaro-Winkler distance. The valid range for +#' @param p Prefix factor for Jaro-Winkler distance. The valid range for #' \code{p} is \code{0 <= p <= 0.25}. If \code{p=0} (default), the #' Jaro-distance is returned. Applies only to \code{method='jw'}. -#' @param bt Winkler's boost threshold. Winkler's penalty factor is +#' @param bt Winkler's boost threshold. Winkler's prefix factor is #' only applied when the Jaro distance is larger than \code{bt} #' Applies only to \code{method='jw'} and \code{p>0}. #' @param nthread Maximum number of threads to use. By default, a sensible @@ -69,6 +69,8 @@ seq_dist <- function(a, b , p >= 0 , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE) , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE) + , length(nthread) == 1 + , is.numeric(nthread) , nthread > 0 ) diff --git a/pkg/R/stringdist.R b/pkg/R/stringdist.R index c7d806e..91df875 100644 --- a/pkg/R/stringdist.R +++ b/pkg/R/stringdist.R @@ -1,6 +1,5 @@ #' A package for string distance calculation and approximate string matching. #' -#' @section Introduction: #' #' The \pkg{stringdist} package offers fast and platform-independent string #' metrics. Its main purpose is to compute various string distances and to do @@ -57,7 +56,7 @@ #' \item{The code for soundex conversion and string similarity was kindly contributed by Jan van der Laan.} #' } #' @section Citation: -#' If you would like to cite this package, please cite the \href{https://journal.r-project.org/archive/2014-1/loo.pdf}{R Journal Paper}: +#' If you would like to cite this package, please cite the \href{https://journal.r-project.org/articles/RJ-2014-011/}{R Journal Paper}: #' \itemize{ #' \item{M.P.J. van der Loo (2014). The \code{stringdist} package for approximate string matching. #' R Journal 6(1) pp 111-122} @@ -71,7 +70,7 @@ #' #' #' -{} +"_PACKAGE" listwarning <- function(x,y){ sprintf(" @@ -107,22 +106,18 @@ This warning can be avoided by explicitly converting the argument(s). #' Weights must be positive and not exceed 1. \code{weight} is ignored #' completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'}, #' \code{'Jaccard'}, \code{'lcs'}, or \code{soundex}. -#' @param maxDist [DEPRECATED AND WILL BE REMOVED|2016] Currently kept for -#' backward compatibility. It does not offer any speed gain. (In fact, it -#' currently slows things down when set to anything different from -#' \code{Inf}). #' @param q Size of the \eqn{q}-gram; must be nonnegative. Only applies to #' \code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}. -#' @param p Penalty factor for Jaro-Winkler distance. The valid range for +#' @param p Prefix factor for Jaro-Winkler distance. The valid range for #' \code{p} is \code{0 <= p <= 0.25}. If \code{p=0} (default), the #' Jaro-distance is returned. Applies only to \code{method='jw'}. -#' @param bt Winkler's boost threshold. Winkler's penalty factor is +#' @param bt Winkler's boost threshold. Winkler's prefix factor is #' only applied when the Jaro distance is larger than \code{bt}. #' Applies only to \code{method='jw'} and \code{p>0}. #' @param nthread Maximum number of threads to use. By default, a sensible #' number of threads is chosen, see \code{\link{stringdist-parallelization}}. #' -#' @seealso \code{\link{stringsim}}, \code{\link{qgrams}}, \code{\link{amatch}} +#' @seealso \code{\link{stringsim}}, \code{\link{qgrams}}, \code{\link{amatch}}, \code{\link{afind}} #' #' @return For \code{stringdist}, a vector with string distances of size #' \code{max(length(a),length(b))}. @@ -143,11 +138,11 @@ stringdist <- function(a, b , method=c("osa","lv","dl","hamming","lcs", "qgram","cosine","jaccard","jw","soundex") , useBytes = FALSE , weight=c(d=1,i=1,s=1,t=1) - , maxDist=Inf, q=1, p=0, bt=0 + , q = 1 + , p = 0 + , bt = 0 , nthread = getOption("sd_num_thread") ){ - if (maxDist < Inf) - warning("Argument 'maxDist' is deprecated for function 'stringdist'. This argument will be removed in the future.") if (is.list(a)|is.list(b)) warning(listwarning("stringdist","seq_dist")) @@ -161,6 +156,8 @@ stringdist <- function(a, b , is.logical(useBytes) , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE) , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE) + , length(nthread) == 1 + , is.numeric(nthread) , nthread > 0 ) @@ -185,7 +182,6 @@ stringdist <- function(a, b do_dist(a=b, b=a , method=method , weight=weight - , maxDist=maxDist , q=q , p=p , bt=bt @@ -195,31 +191,20 @@ stringdist <- function(a, b #' @param useNames Use input vectors as row and column names? -#' @param ncores [DEPRECATED AND WILL BE REMOVED|2016]. Use \code{nthread} in -#' stead. This argument is ignored. -#' @param cluster [DEPRECATED AND WILL BE REMOVED|2016]. A custom cluster, -#' created with \code{\link[parallel]{makeCluster}}. #' #' #' @rdname stringdist #' @export -#' @rdname stringdist stringdistmatrix <- function(a, b , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard","jw","soundex") , useBytes = FALSE - , weight=c(d=1,i=1,s=1,t=1), maxDist=Inf, q=1, p=0, bt=0 - , useNames=c('none','strings','names'), ncores=1, cluster=NULL + , weight=c(d=1,i=1,s=1,t=1) + , q = 1 + , p = 0 + , bt = 0 + , useNames=c('none','strings','names') , nthread = getOption("sd_num_thread") ){ - if (maxDist < Inf) - warning("Argument 'maxDist' is deprecated for function 'stringdistmatrix'. This argument will be removed in the future.") - if (ncores > 1 ){ - warning("Argument 'ncores' is deprecated as stringdist now uses multithreading by default. This argument is currently ignored and will be removed in the future.") - ncores <- 1 - } - if ( !is.null(cluster) ){ - message("Argument 'cluster' is deprecaterd as stringdust now uses multithreading by default. The argument is currently ignored and will be removed in the future") - } if (is.list(a)|| (!missing(b) && is.list(b)) ){ warning(listwarning("stringdistmatrix","seq_distmatrix")) } @@ -241,7 +226,8 @@ stringdistmatrix <- function(a, b , is.logical(useBytes) , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE) , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE) - , ncores > 0 + , length(nthread) == 1 + , is.numeric(nthread) , nthread > 0 ) @@ -291,7 +277,7 @@ stringdistmatrix <- function(a, b } x <- vapply(b, do_dist, USE.NAMES=FALSE, FUN.VALUE=numeric(length(a)) - , a, method,weight,maxDist, q, p, bt, useBytes, nthread) + , a, method,weight, q, p, bt, useBytes, nthread) if (useNames %in% c("strings","names") ){ structure(matrix(x,nrow=length(a),ncol=length(b), dimnames=list(rowns,colns))) @@ -322,10 +308,11 @@ METHODS <- c( , jaccard = 7L , jw = 8L , soundex = 9L + , running_cosine = 10L ) -do_dist <- function(a, b, method, weight, maxDist=Inf, q, p, bt, useBytes=FALSE, nthread=1L){ +do_dist <- function(a, b, method, weight, q, p, bt, useBytes=FALSE, nthread=1L){ if (method=='soundex' && !all(printable_ascii(a) & printable_ascii(b)) ){ warning("Non-printable ascii or non-ascii characters in soundex. Results may be unreliable. See ?printable_ascii.") @@ -338,11 +325,9 @@ do_dist <- function(a, b, method, weight, maxDist=Inf, q, p, bt, useBytes=FALSE, d <- .Call("R_stringdist", a, b, method , as.double(weight), as.double(p), as.double(bt), as.integer(q) , as.integer(useBytes), as.integer(nthread) + , PACKAGE="stringdist" ) - if (maxDist < Inf ){ - d[!is.na(d) & d > maxDist] <- Inf - } d } @@ -361,10 +346,11 @@ lower_tri <- function(a if (is.na(method)){ stop(sprintf("method '%s' is not defined",method)) } - + x <- .Call("R_lower_tri", a, methnr , as.double(weight), as.double(p), as.double(bt) - , as.integer(q), as.integer(useBytes), as.integer(nthread)) + , as.integer(q), as.integer(useBytes), as.integer(nthread) + , PACKAGE="stringdist") attributes(x) <- list(class='dist' , Size = length(a) diff --git a/pkg/R/stringsim.R b/pkg/R/stringsim.R index 3f7a844..74add2b 100644 --- a/pkg/R/stringsim.R +++ b/pkg/R/stringsim.R @@ -3,6 +3,8 @@ #' \code{stringsim} computes pairwise string similarities between elements of #' \code{character} vectors \code{a} and \code{b}, where the vector with less #' elements is recycled. +#' \code{stringsimmatrix} computes the string similarity matrix with rows +#' according to \code{a} and columns according to \code{b}. #' #' @param a R object (target); will be converted by \code{as.character}. #' @param b R object (source); will be converted by \code{as.character}. @@ -11,14 +13,16 @@ #' @param useBytes Perform byte-wise comparison, see \code{\link{stringdist-encoding}}. #' @param q Size of the \eqn{q}-gram; must be nonnegative. Only applies to #' \code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}. -#' @param ... additional arguments are passed on to \code{\link{stringdist}}. -#' +#' @param ... additional arguments are passed on to \code{\link{stringdist}} and +#' \code{\link{stringdistmatrix}} respectively. #' @return -#' Returns a vector with similarities, which are values between 0 and 1 where -#' 1 corresponds to perfect similarity (distance 0) and 0 to complete -#' dissimilarity. \code{NA} is returned when \code{\link{stringdist}} returns -#' \code{NA}. Distances equal to \code{Inf} are truncated to a similarity of -#' 0. +#' \code{stringsim} returns a vector with similarities, which are values between +#' 0 and 1 where 1 corresponds to perfect similarity (distance 0) and 0 to +#' complete dissimilarity. \code{NA} is returned when \code{\link{stringdist}} +#' returns \code{NA}. Distances equal to \code{Inf} are truncated to a +#' similarity of 0. \code{stringsimmatrix} works the same way but, equivalent to +#' \code{\link{stringdistmatrix}}, returns a similarity matrix instead of a +#' vector. #' #' @details #' The similarity is calculated by first calculating the distance using @@ -45,6 +49,23 @@ stringsim <- function(a, b, method = c("osa", "lv", "dl", "hamming", "lcs", } +#' @rdname stringsim +#' @export +stringsimmatrix <- function(a, b, method = c("osa", "lv", "dl", "hamming", "lcs", + "qgram", "cosine", "jaccard", "jw", "soundex"), useBytes=FALSE, q = 1, ...) { + # Calculate the distance + method <- match.arg(method) + nctype <- if (useBytes) "bytes" else "char" + if (missing(b)){ + dist <- stringdist::stringdistmatrix(a, method=method, useBytes=useBytes, q=q, ...) + normalize_dist(dist, a= rep(a,length(a)), b = rep(a,each=length(a)), method=method, nctype=nctype, q=q) + } else { + dist <- stringdist::stringdistmatrix(a, b, method=method, useBytes=useBytes, q=q, ...) + normalize_dist(dist, a=rep(a,length(b)), b=rep(b,each=length(a)), method=method, nctype=nctype, q=q) + } +} + + #' Compute similarity scores between sequences of integers #' #' @param a \code{list} of \code{integer} vectors (target) @@ -67,10 +88,10 @@ stringsim <- function(a, b, method = c("osa", "lv", "dl", "hamming", "lcs", #' @export seq_sim <- function(a, b, method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw"), q = 1, ...) { - + method <- match.arg(method) dist <- stringdist::seq_dist(a, b, method=method, q=q, ...) - normalize_dist(dist,a,b,method=method,q=q) + normalize_dist(dist, a, b, method=method, q=q) } @@ -86,11 +107,13 @@ lengths.character <- function(x, type="char",...){ } lengths.list <- function(x,...){ - .Call("R_lengths",x) + .Call("R_lengths",x, PACKAGE="stringdist") } normalize_dist <- function(dist, a, b, method, nctype="char",q=1L){ + if (inherits(dist, "dist")) dist <- as.matrix(dist) + # Normalise the distance by dividing it by the maximum possible distance if (method == "hamming") { max_dist <- if (length(b) > length(a)) lengths(b,type=nctype) else lengths(a,type=nctype) diff --git a/pkg/R/utils.R b/pkg/R/utils.R index 1296abc..7553b5c 100644 --- a/pkg/R/utils.R +++ b/pkg/R/utils.R @@ -27,7 +27,7 @@ mymsg <- message if ( is.na(omp_thread_limit) ) omp_thread_limit <- nthread nthread <- min(omp_thread_limit,nthread) - options(sd_num_thread=nthread) + options(sd_num_thread=as.integer(nthread)) } # When necessary and possible, argument is coverted to integers. @@ -84,7 +84,7 @@ printable_ascii <- function(x){ # check whether all elements of a list are of type 'integer'. # x MUST be a list. all_int <- function(x){ - .Call("R_all_int",x) + .Call("R_all_int",x,PACKAGE="stringdist") } diff --git a/pkg/README.md b/pkg/README.md new file mode 100644 index 0000000..5560f32 --- /dev/null +++ b/pkg/README.md @@ -0,0 +1,58 @@ +[![Mentioned in Awesome Official Statistics ](https://awesome.re/mentioned-badge.svg)](https://github.com/SNStatComp/awesome-official-statistics-software) + +## stringdist + +* Approximate matching and string distance calculations for R. +* All distance and matching operations are system- and encoding-independent. +* Built for speed, using [openMP](https://www.openmp.org/) for parallel computing. + +The package offers the following main functions: + +* `stringdist` computes pairwise distances between two input character vectors (shorter one is recycled) +* `stringdistmatrix` computes the distance matrix for one or two vectors +* `stringsim` computes a string similarity between 0 and 1, based on `stringdist` +* `amatch` is a fuzzy matching equivalent of R's native `match` function +* `ain` is a fuzzy matching equivalent of R's native `%in%` operator +* `seq_dist`, `seq_distmatrix`, `seq_amatch` and `seq_ain` for distances between, and matching of integer sequences. + +These functions are built upon `C`-code that re-implements some common (weighted) string +distance functions. Distance functions include: + +* Hamming distance; +* Levenshtein distance (weighted) +* Restricted Damerau-Levenshtein distance (weighted, a.k.a. Optimal String Alignment) +* Full Damerau-Levenshtein distance +* Longest Common Substring distance +* Q-gram distance +* cosine distance for q-gram count vectors (= 1-cosine similarity) +* Jaccard distance for q-gram count vectors (= 1-Jaccard similarity) +* Jaro, and Jaro-Winkler distance +* Soundex-based string distance + +Also, there are some utility functions: + +* `qgrams()` tabulates the qgrams in one or more `character` vectors. +* `seq_qrams()` tabulates the qgrams (somtimes called ngrams) in one or more `integer` vectors. +* `phonetic()` computes phonetic codes of strings (currently only soundex) +* `printable_ascii()` is a utility function that detects non-printable ascii or non-ascii characters. + +#### C API + +Some of `stringdist`'s underlying `C` functions can be called directly from +`C` code in other packages. The description of the API can be found by either +typing `?stringdist_api` in the R console or open the vignette directly as follows: + +``` +vignette("stringdist_C-Cpp_api", package="stringdist") +``` + +Examples of packages that link to `stringdist` can be found +[here](https://github.com/markvanderloo/linkstringdist) and +[here](https://github.com/ChrisMuir/refinr). + + +#### Resources + +* A [paper](https://journal.r-project.org/articles/RJ-2014-011/) on stringdist has been published in the R-journal +* [Slides](https://www.markvanderloo.eu/files/share/loo2014approximate.pdf) of a talk given at te _useR!2014_ conference. + diff --git a/pkg/inst/include/Doxyfile b/pkg/inst/include/Doxyfile index bfaf721..72ecfbe 100644 --- a/pkg/inst/include/Doxyfile +++ b/pkg/inst/include/Doxyfile @@ -58,7 +58,7 @@ PROJECT_LOGO = # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. -OUTPUT_DIRECTORY = "../doc" +OUTPUT_DIRECTORY = "../../vignettes" # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and diff --git a/pkg/inst/include/stringdist_api.h b/pkg/inst/include/stringdist_api.h index c48773f..ae847d5 100644 --- a/pkg/inst/include/stringdist_api.h +++ b/pkg/inst/include/stringdist_api.h @@ -21,9 +21,6 @@ #ifndef _STRINGDIST_API_H #define _STRINGDIST_API_H -#include // also includes R.h, Rinternals.h, Rdefines.h - -#include #include #ifdef HAVE_VISIBILITY_ATTRIBUTE @@ -53,14 +50,22 @@ extern "C" { * ``` * * - * An example of a package using this API is [refinr](https://CRAN.R-project.org/package=refinr). - * + * An example of a published package using this API is + * [refinr](https://CRAN.R-project.org/package=refinr). A minimal example can be + * found [here](https://github.com/markvanderloo/linkstringdist). * * @section encoding Character encoding - * All `character` vector input is expected to be in `UTF-8` (this also allows `ASCII`). - * Distance computations are based on UTF [code points](https://en.wikipedia.org/wiki/Code_point) unless `useBytes` is `TRUE`, in which - * case distances are computed over byte sequences. Using non-UTF-8 encoded strings is - * untested and is highly likely to result in errors. + * All `character` vector input is expected to be in `UTF-8` (this also allows + * `ASCII`). Distance computations are based on UTF [code + * points](https://en.wikipedia.org/wiki/Code_point) unless `useBytes` is + * `TRUE`, in which case distances are computed over byte sequences. Using + * non-UTF-8 encoded strings is untested and is highly likely to result in + * errors. + * + * @section threads Thread safety + * + * It is not safe to call functions from `stringdist` C API from + * multiple concurrent threads. * * * @@ -102,9 +107,9 @@ SEXP attribute_hidden sd_all_int(SEXP X) * - 9: Soundex (`"soundex"`) * @endparblock * @param nomatch `[integer]` The value to be returned when no match is found. - * @param matchNA Should `NA`s be matched? Default behaviour mimics the - * behaviour of base `match`, meaning that `NA` matches - * `NA` (see also the note on `NA` handling below). + * @param matchNA Should `NA`s be matched? Default behaviour mimics the + * behaviour of base `match`, meaning that `NA` matches `NA` (see also the note + * on `NA` handling below). * @param weight `[numeric]` vector. Edit penalty * @parblock * For `method='osa'` or`'dl'`, the penalty for @@ -115,16 +120,17 @@ SEXP attribute_hidden sd_all_int(SEXP X) * Weights must be positive and not exceed 1. `weight` is ignored * completely for other methods * @endparblock - * @param q `[integer]` scalar. Size of the q-gram; must be nonnegative. Only applies to - * `method='qgram'`, `'jaccard'` or `'cosine'`. + * @param q `[integer]` scalar. Size of the q-gram; must be nonnegative. Only + * applies to `method='qgram'`, `'jaccard'` or `'cosine'`. * @param maxDistance `[numeric]` scalar. The maximum distance allowed for matching. - * @param p `[numeric]` scalar. Penalty factor for Jaro-Winkler distance. The valid range for - * `p` is `0 <= p <= 0.25`. If `p=0` (default), the + * @param p `[numeric]` scalar. Penalty factor for Jaro-Winkler distance. The + * valid range for `p` is `0 <= p <= 0.25`. If `p=0` (default), the * Jaro-distance is returned. Applies only to `method='jw'`. - * @param bt `[numeric]` vector. Winkler's boost threshold. Winkler's penalty factor is - * only applied when the Jaro distance is larger than `bt`. - * Applies only to `method='jw'` and `p>0`. - * @param useBytes Perform byte-wise comparison (i.e. do not translate UTF-8 to integer prior to distance calculation) + * @param bt `[numeric]` vector. Winkler's boost threshold. Winkler's penalty + * factor is only applied when the Jaro distance is larger than `bt`. Applies + * only to `method='jw'` and `p>0`. + * @param useBytes Perform byte-wise comparison (i.e. do not translate UTF-8 to + * integer prior to distance calculation) * @param nthread `[integer]` scalar. Maximum number of threads to use. * * @@ -149,9 +155,11 @@ SEXP attribute_hidden sd_amatch(SEXP x, SEXP table, SEXP method * @param qq `[integer`] scalar. * * @return - * A `[numeric]` vector of `length(a)*n_qgrams`, where `n_qrams` is the number of different `qgrams` observed - * in the elements of `a`. The output vector has an attribute called `qgrams`, which is an integer vector - * of size `q*n_qgrams` containing integer (UTF-32) labels for the q-grams sequentially. + * A `[numeric]` vector of `length(a)*n_qgrams`, where `n_qrams` is the number + * of different `qgrams` observed in the elements of `a`. The output vector has + * an attribute called `qgrams`, which is an integer vector of size + * `q*n_qgrams` containing integer (UTF-32) labels for the q-grams + * sequentially. * */ SEXP attribute_hidden sd_get_qgrams(SEXP a, SEXP qq) @@ -189,28 +197,29 @@ SEXP attribute_hidden sd_lengths(SEXP X) * @endparblock * @param weight `[numeric]` vector. Edit penalty * @parblock - * For `method='osa'` or`'dl'`, the penalty for - * deletion, insertion, substitution and transposition, in that order. When - * `method='lv'`, the penalty for transposition is ignored. When - * `method='jw'`, the weights associated with characters of `a`, - * characters from `b` and the transposition weight, in that order. - * Weights must be positive and not exceed 1. `weight` is ignored - * completely for other methods + * For `method='osa'` or`'dl'`, the penalty for deletion, insertion, + * substitution and transposition, in that order. When `method='lv'`, the + * penalty for transposition is ignored. When `method='jw'`, the weights + * associated with characters of `a`, characters from `b` and the + * transposition weight, in that order. Weights must be positive and not + * exceed 1. `weight` is ignored completely for other methods * @endparblock - * @param q `[integer]` scalar. Size of the q-gram; must be nonnegative. Only applies to - * `method='qgram'`, `'jaccard'` or `'cosine'`. - * @param p `[numeric]` scalar. Penalty factor for Jaro-Winkler distance. The valid range for - * `p` is `0 <= p <= 0.25`. If `p=0` (default), the + * @param q `[integer]` scalar. Size of the q-gram; must be nonnegative. Only + * applies to `method='qgram'`, `'jaccard'` or `'cosine'`. + * @param p `[numeric]` scalar. Penalty factor for Jaro-Winkler distance. The + * valid range for `p` is `0 <= p <= 0.25`. If `p=0` (default), the * Jaro-distance is returned. Applies only to `method='jw'`. - * @param bt `[numeric]` vector. Winkler's boost threshold. Winkler's penalty factor is - * only applied when the Jaro distance is larger than `bt`. - * Applies only to `method='jw'` and `p>0`. - * @param useBytes Perform byte-wise comparison (i.e. do not translate UTF-8 to integer prior to distance calculation) + * @param bt `[numeric]` vector. Winkler's boost threshold. Winkler's penalty + * factor is only applied when the Jaro distance is larger than `bt`. Applies + * only to `method='jw'` and `p>0`. + * @param useBytes Perform byte-wise comparison (i.e. do not translate UTF-8 to + * integer prior to distance calculation) * @param nthread `[integer]` scalar. Maximum number of threads to use. * * @return - * A `[numeric]` vector of length `n*(n-1)/2`, where `n=length(a)`. It contains the positive values of consequtive columns - * of the distance matrix. Also see the R-code in `stringdist:::lower_tri`. + * A `[numeric]` vector of length `n*(n-1)/2`, where `n=length(a)`. It contains + * the positive values of consequtive columns of the distance matrix. Also see + * the R-code in `stringdist:::lower_tri`. */ SEXP attribute_hidden sd_lower_tri(SEXP a, SEXP method , SEXP weight, SEXP p, SEXP bt, SEXP q @@ -229,7 +238,9 @@ SEXP attribute_hidden sd_lower_tri(SEXP a, SEXP method * * @return * - * A character vector of `length(x)` with soundex codes for elements of `x`. + * A `list` with `length(x)` element. Each element is a length 4 integer vector + * representing a 4-character soundex code. The integers are ASCII code points. + * */ SEXP attribute_hidden sd_soundex(SEXP x, SEXP useBytes) { @@ -258,29 +269,30 @@ SEXP attribute_hidden sd_soundex(SEXP x, SEXP useBytes) * @endparblock * @param weight `[numeric]` vector. Edit penalty * @parblock - * For `method='osa'` or`'dl'`, the penalty for - * deletion, insertion, substitution and transposition, in that order. When - * `method='lv'`, the penalty for transposition is ignored. When - * `method='jw'`, the weights associated with characters of `a`, - * characters from `b` and the transposition weight, in that order. - * Weights must be positive and not exceed 1. `weight` is ignored - * completely for other methods + * For `method='osa'` or`'dl'`, the penalty for deletion, insertion, + * substitution and transposition, in that order. When `method='lv'`, the + * penalty for transposition is ignored. When `method='jw'`, the weights + * associated with characters of `a`, characters from `b` and the + * transposition weight, in that order. Weights must be positive and not + * exceed 1. `weight` is ignored completely for other methods * @endparblock - * @param q `[integer]` scalar. Size of the q-gram; must be nonnegative. Only applies to - * `method='qgram'`, `'jaccard'` or `'cosine'`. - * @param p `[numeric]` scalar. Penalty factor for Jaro-Winkler distance. The valid range for - * `p` is `0 <= p <= 0.25`. If `p=0` (default), the + * @param q `[integer]` scalar. Size of the q-gram; must be nonnegative. Only + * applies to `method='qgram'`, `'jaccard'` or `'cosine'`. + * @param p `[numeric]` scalar. Penalty factor for Jaro-Winkler distance. The + * valid range for `p` is `0 <= p <= 0.25`. If `p=0` (default), the * Jaro-distance is returned. Applies only to `method='jw'`. - * @param bt `[numeric]` vector. Winkler's boost threshold. Winkler's penalty factor is - * only applied when the Jaro distance is larger than `bt`. - * Applies only to `method='jw'` and `p>0`. - * @param useBytes Perform byte-wise comparison (i.e. do not translate UTF-8 to integer prior to distance calculation) + * @param bt `[numeric]` vector. Winkler's boost threshold. Winkler's penalty + * factor is only applied when the Jaro distance is larger than `bt`. Applies + * only to `method='jw'` and `p>0`. + * @param useBytes Perform byte-wise comparison (i.e. do not translate UTF-8 to + * integer prior to distance calculation) * @param nthread `[integer]` scalar. Maximum number of threads to use. * * * @return - * A `[numeric]` vector of length `max(length(a),length(b))` where the shortest vector is recycled over the - * longer (no warnings are given when the longer length is not an integer multiple of the shorter length). + * A `[numeric]` vector of length `max(length(a),length(b))` where the shortest + * vector is recycled over the longer (no warnings are given when the longer + * length is not an integer multiple of the shorter length). * * */ diff --git a/pkg/inst/tinytest/test_afind.R b/pkg/inst/tinytest/test_afind.R new file mode 100644 index 0000000..4d1fa9b --- /dev/null +++ b/pkg/inst/tinytest/test_afind.R @@ -0,0 +1,127 @@ +options(sd_num_thread=1L) + +# tests against cases that used to segfault when we did not check +# NULL cases. +expect_error(afind("a","b",nthread=1:4)) +expect_error(afind("a","b",nthread="foo")) +expect_error(afind("a","b",nthread=integer(0))) +expect_error(afind("a","b",nthread=NULL)) + + + + + +texts = c("When I grow up, I want to be" + , "one of the harversters of the sea" + , "I think before my days are gone" + , "I want to be a fisherman") + +patterns = c("fish", "gone","to be") + +out <- afind(texts, patterns, method="osa") + +location <- matrix(c( + 1, 1, 24, + 6, 1, 28, + 1, 28, 6, + 16, 3, 8), + nrow=4, byrow=TRUE) + + +distance <- matrix(c( + 4, 3, 0, + 2, 2, 3, + 3, 0, 2, + 0, 3, 0), + nrow=4, byrow=TRUE) + +match <- matrix(c( + "When", "When", "to be", + "f th", "one ", "he se", + "I th", "gone", "nk be", + "fish", "want", "to be"), + nrow=4, byrow=TRUE) + + +expect_equal(out$location, location) +expect_equal(out$distance, distance) +expect_equal(out$match, match) + +# test paralellization + +out1 <- afind(texts, patterns, method="osa", nthread=2L) +expect_identical(out, out1) + +# test 'value' option +out2 <- afind(texts, patterns, value=FALSE) +expect_equal(length(out2), 2) + + +# test grep/grepl equivalents 'grab', 'grabl' + +expect_equal(grab(texts, "harvester", maxDist=2), 2) +expect_equal(grab(texts, "harvester", value=TRUE, maxDist=2), "harverste") +expect_equal(grabl(texts, "harvester", maxDist=2) + , c(FALSE,TRUE,FALSE,FALSE)) + +expect_equal(extract(texts, "harvester", maxDist=2) + , matrix(c(NA, "harverste",NA,NA),nrow=4) ) + +## Test running_cosine +pattern <- c("phish", "want to") + +expect_identical( + afind(texts, pattern, method="cosine", q=3) + , afind(texts, pattern, method="running_cosine", q=3) +) + + +## test whether the correct positions are returned for all methods. + +methods = names(stringdist:::METHODS) +methods = methods[!methods %in% c("soundex","hamming")] +text <- "If you squeeze my lizzard, I put my snake on you." +pattern <- "lizard" + +for ( method in methods ){ + expect_equal(afind(text, pattern, method=method, q=3, p=0.1)$location[1,1], 19, info=method) +} + +## test the usual edge cases + +# notice: window size = 0. +expect_equal(afind("foo","")$distance[1], 0) + +expect_equal(afind("foo",NA)$distance[1], NA_real_) +expect_equal(afind("foo",NA)$location[1], NA_integer_) +expect_equal(afind("foo",NA)$match[1], NA_character_) + +expect_equal(afind(NA,"foo")$distance[1], NA_real_) +expect_equal(afind(NA,"foo")$location[1], NA_integer_) +expect_equal(afind(NA,"foo")$match[1], NA_character_) + +expect_equal(afind("","foo")$distance[1], 3) +expect_equal(afind("","foo")$location[1], 1) +expect_equal(afind("","foo")$match[1], "") + +expect_equal(grab("foo", ""), 1L) +expect_equal(grabl("foo",""), TRUE) +expect_equal(grab("foo",NA), integer(0)) + +# note that 'grepl' gives FALSE in this case (which is inconsistent with +# grepl(NA, NA), grepl(NA, "foo"). +expect_equal(grabl("foo",NA), NA) + + + + + + + + + + + + + + diff --git a/pkg/tests/testthat/testAmatch.R b/pkg/inst/tinytest/test_amatch.R similarity index 87% rename from pkg/tests/testthat/testAmatch.R rename to pkg/inst/tinytest/test_amatch.R index c31adc0..7a020d0 100644 --- a/pkg/tests/testthat/testAmatch.R +++ b/pkg/inst/tinytest/test_amatch.R @@ -1,7 +1,14 @@ +options(sd_num_thread=2) +## amatch: Optimal String Alignment +# tests against cases that used to segfault when we did not check +# NULL cases. +expect_error(amatch("a","b",nthread=1:4)) +expect_error(amatch("a","b",nthread="foo")) +expect_error(amatch("a","b",nthread=integer(0))) +expect_error(amatch("a","b",nthread=NULL)) -context("amatch: Optimal String Alignment") -test_that("simple test and multiple edge cases",{ +## simple test and multiple edge cases expect_equal(amatch("aa",c("ba","bb"), method="osa",maxDist=1L), 1L) expect_equal(amatch("aa",c("bb","bb"), method="osa",maxDist=1L), NA_integer_) expect_equal(amatch("aa",c("bbb"), method="osa",maxDist=2L), NA_integer_) @@ -16,13 +23,13 @@ test_that("simple test and multiple edge cases",{ expect_equal(amatch("aa","bb", method="osa",maxDist=1), NA_integer_) expect_equal(amatch("aa","bb", method="osa",maxDist=1), NA_integer_) expect_equal(amatch(c("m","fem"),c("male","female"),method="osa",maxDist=Inf), c(1,2)) -}) -context("amatch: Damerau-Levenshtein") -test_that("simple test and multiple edge cases",{ +## amatch: Damerau-Levenshtein + +## simple test and multiple edge cases expect_equal(amatch("aa", c("ba","bb"), method="dl",maxDist=1L), 1L) expect_equal(amatch("aa",c("bb","bb"), method="dl",maxDist=1L), NA_integer_) expect_equal(amatch("aa",c("bbb"), method="dl",maxDist=2L), NA_integer_) @@ -37,11 +44,11 @@ test_that("simple test and multiple edge cases",{ expect_equal(amatch("aa","bb", method="dl",maxDist=1), NA_integer_) expect_equal(amatch("aa","bb", method="dl",maxDist=1), NA_integer_) expect_equal(amatch(c("m","fem"),c("male","female"),method="dl",maxDist=Inf), c(1,2)) -}) -context("amatch: Hamming") -test_that("simple test and multiple edge cases",{ +## amatch: Hamming + +## simple test and multiple edge cases expect_equal(amatch("aa", c("ba","bb"), method="hamming",maxDist=1L), 1L) expect_equal(amatch("aa",c("bb","bb"), method="hamming",maxDist=1L), NA_integer_) expect_equal(amatch(NA,c(NA,NA),method="hamming"),1L) @@ -53,12 +60,12 @@ test_that("simple test and multiple edge cases",{ expect_equal(amatch(NA,NA, method="hamming",matchNA=FALSE,nomatch=0L), 0L) expect_equal(amatch(NA,NA, method="hamming",matchNA=FALSE,nomatch=7L), 7L) expect_equal(amatch("aa","bb", method="hamming",maxDist=1), NA_integer_) -}) -context("amatch: Jaro and Jaro-Winkler") -test_that("simple test and multiple edge cases",{ +## amatch: Jaro and Jaro-Winkler + +## simple test and multiple edge cases expect_equal(amatch("aa", c("ba","bb"), method="jw",maxDist=1L), 1L) expect_equal(amatch("aa",c("bb","bb"), method="jw",maxDist=0.5), NA_integer_) expect_equal(amatch(NA,c(NA,NA),method="jw"),1L) @@ -70,11 +77,10 @@ test_that("simple test and multiple edge cases",{ expect_equal(amatch(NA,NA, method="jw",matchNA=FALSE,nomatch=0L), 0L) expect_equal(amatch(NA,NA, method="jw",matchNA=FALSE,nomatch=7L), 7L) expect_equal(amatch(c("m","fem"),c("male","female"),method="jw",maxDist=Inf), c(1,2)) -}) -context("amatch: Longest Common Substring") +## amatch: Longest Common Substring -test_that("simple test and multiple edge cases",{ +## simple test and multiple edge cases expect_equal(amatch("aa", c("ba","bb"), method="lcs",maxDist=2L), 1L) expect_equal(amatch("aa",c("bb","bb"), method="lcs",maxDist=1L), NA_integer_) expect_equal(amatch("aa",c("bbb"), method="lcs",maxDist=2L), NA_integer_) @@ -88,12 +94,10 @@ test_that("simple test and multiple edge cases",{ expect_equal(amatch(NA,NA, method="lcs",matchNA=FALSE,nomatch=0L), 0L) expect_equal(amatch(NA,NA, method="lcs",matchNA=FALSE,nomatch=7L), 7L) -}) +## amatch: Levenshtein -context("amatch: Levenshtein") - -test_that("simple test and multiple edge cases",{ +## simple test and multiple edge cases expect_equal(amatch("aa", c("ba","bb"), method="lv",maxDist=1L), 1L) expect_equal(amatch("aa",c("bb","bb"), method="lv",maxDist=1L), NA_integer_) expect_equal(amatch("aa",c("bbb"), method="lv",maxDist=2L), NA_integer_) @@ -107,11 +111,11 @@ test_that("simple test and multiple edge cases",{ expect_equal(amatch(NA,NA, method="lv",matchNA=FALSE,nomatch=0L), 0L) expect_equal(amatch(NA,NA, method="lv",matchNA=FALSE,nomatch=7L), 7L) expect_equal(amatch(c("m","fem"),c("male","female"),method="lv",maxDist=Inf), c(1,2)) -}) -context("amatch: qgrams") -test_that("simple test and multiple edge cases",{ +## amatch: qgrams + +## simple test and multiple edge cases expect_equal(amatch("aa", c("ba","bb"), method="qgram",maxDist=2), 1L) expect_equal(amatch("aa",c("bb","bb"), method="qgram",maxDist=1L), NA_integer_) expect_equal(amatch(NA,c(NA,NA),method="qgram"),1L) @@ -128,12 +132,11 @@ test_that("simple test and multiple edge cases",{ c("2100 EXAMPLE AVE NJ 8619", "600 EXAMPLE AVE NJ 8629"), method="jaccard") , 2L) -}) -context("amatch: Soundex") +## amatch: Soundex -test_that("simple test and multiple edge cases",{ +## simple test and multiple edge cases expect_equal(amatch("smith", c("smyth","smelt"), method="soundex"), 1L) expect_equal(amatch("smith",c("bb","bb"), method="soundex"), NA_integer_) expect_equal(amatch("smith",c("whashington"), method="soundex"), NA_integer_) @@ -146,12 +149,11 @@ test_that("simple test and multiple edge cases",{ expect_equal(amatch(NA,NA, method="soundex",matchNA=FALSE), NA_integer_) expect_equal(amatch(NA,NA, method="soundex",matchNA=FALSE,nomatch=0L), 0L) expect_equal(amatch(NA,NA, method="soundex",matchNA=FALSE,nomatch=7L), 7L) -}) -context("amatch: useBytes") +## amatch: useBytes -test_that("bytewise matching differs from character wise matching",{ +## bytewise matching differs from character wise matching x <- paste0('Mot',intToUtf8(0x00F6),'rhead') y <- c('bastard','Motorhead') jwdist <- round(1-(1/3)*(8/9 + 8/10 + 1),3) @@ -172,16 +174,14 @@ test_that("bytewise matching differs from character wise matching",{ expect_equal(amatch(x, y, method='qgram',maxDist=6, q=3, useBytes=TRUE, nomatch=0L), 0L); -}) +## seq_amatch -context("seq_amatch") - -test_that("Input checks for seq_amatch",{ +## Input checks for seq_amatch expect_equal(seq_amatch(list(1:10),list(1:10)),seq_amatch(1:10,1:10)) expect_equal(seq_amatch(list(1:10),list(1:10)),seq_amatch(as.numeric(1:10),as.numeric(1:10))) -}) -test_that("Some elementary tests for seq_amatch and seq_ain",{ + +## Some elementary tests for seq_amatch and seq_ain x <- list(c(1L,3L,2L)) table <- list( @@ -192,10 +192,11 @@ test_that("Some elementary tests for seq_amatch and seq_ain",{ expect_equal(seq_amatch(list(NA_integer_),table,maxDist=3),NA_integer_ ) expect_true(seq_ain(x,table,maxDist=3)) expect_false(seq_ain(x,table)) -}) - - +expect_error(seq_amatch(x,table,nthread=1:4)) +expect_error(seq_amatch(x,table,nthread="foo")) +expect_error(seq_amatch(x,table,nthread=integer(0))) +expect_error(seq_amatch(x,table,nthread=NULL)) diff --git a/pkg/inst/tinytest/test_gh_issue_59.R b/pkg/inst/tinytest/test_gh_issue_59.R new file mode 100644 index 0000000..b9e2e8e --- /dev/null +++ b/pkg/inst/tinytest/test_gh_issue_59.R @@ -0,0 +1,9 @@ +# this would crash R because of over-asking memory +# it depends on the system really, so we only run this at the +# comfort of our home +if (FALSE){ + x <- paste(letters[sample(1:length(letters),32800,replace=TRUE)], collapse="") + expect_error(stringdist(x,x)) +} + + diff --git a/pkg/inst/tinytest/test_gh_issue_78.R b/pkg/inst/tinytest/test_gh_issue_78.R new file mode 100644 index 0000000..2f06d9f --- /dev/null +++ b/pkg/inst/tinytest/test_gh_issue_78.R @@ -0,0 +1,9 @@ + +# x <- "IÑIGO", we avoid problems on Windows here. +x <- intToUtf8(c(73, 209, 73, 71, 79)) + +expect_equal(stringdist("INIGO", x, method="lv", useBytes=FALSE),1) +expect_equal(amatch("INIGO", x, method="lv",maxDist=1),1) + + + diff --git a/pkg/inst/tinytest/test_gh_issue_88.R b/pkg/inst/tinytest/test_gh_issue_88.R new file mode 100644 index 0000000..3549f9f --- /dev/null +++ b/pkg/inst/tinytest/test_gh_issue_88.R @@ -0,0 +1,8 @@ +options(sd_num_thread=1L) + +x <- c("ca", "abc", "cba") +expect_equal(stringsimmatrix(x), t(stringsimmatrix(x))) + + + + diff --git a/pkg/tests/testthat/testPhonetic.R b/pkg/inst/tinytest/test_phonetic.R similarity index 74% rename from pkg/tests/testthat/testPhonetic.R rename to pkg/inst/tinytest/test_phonetic.R index 740f65b..2e5a23a 100644 --- a/pkg/tests/testthat/testPhonetic.R +++ b/pkg/inst/tinytest/test_phonetic.R @@ -1,9 +1,8 @@ - -library(testthat) +options(sd_num_thread=2) ### ------------------------------------------------------------- -context("Phonetic") -test_that("Soundex",{ + +## Soundex testset <- "name;code Robert;R163 @@ -19,12 +18,12 @@ washington;W252 Lee;L000 NA;NA" testset <- read.csv2(textConnection(testset), stringsAsFactors=FALSE) - expect_that(phonetic(testset$name,"soundex"), equals(testset$code)) - expect_that(phonetic(testset$name,"soundex",useBytes=TRUE), equals(testset$code)) + expect_equal(phonetic(testset$name,"soundex"), testset$code) + expect_equal(phonetic(testset$name,"soundex",useBytes=TRUE), testset$code) expect_warning(phonetic(paste0('Mot',intToUtf8(0x00F6),'rhead'))) -}) -test_that("soundex handles encoding",{ + +## soundex handles encoding ouml <- intToUtf8("0x00F6") # non-ascii within string expect_warning(phonetic(paste0("Mot",ouml,"rhead"),method='soundex')) @@ -33,7 +32,7 @@ test_that("soundex handles encoding",{ # non-printable in string (carriage return) cr <- "\r" expect_warning(phonetic(paste0(cr,"hello"),method='soundex')) -}) + diff --git a/pkg/tests/testthat/testQgrams.R b/pkg/inst/tinytest/test_qgrams.R similarity index 81% rename from pkg/tests/testthat/testQgrams.R rename to pkg/inst/tinytest/test_qgrams.R index b961a79..66bf4fd 100644 --- a/pkg/tests/testthat/testQgrams.R +++ b/pkg/inst/tinytest/test_qgrams.R @@ -1,7 +1,7 @@ +options(sd_num_thread=2) +## qgrams -context("qgrams") - -test_that("qgram edge cases",{ +## qgram edge cases expect_equivalent(qgrams('a' , q=1), as.matrix(c(a=1))) # basic test expect_equivalent(qgrams('aa', q=1), as.matrix(c(a=2))) # idem expect_equivalent(qgrams(c('a','a'),q=1), as.matrix(c(a=2))) # count unique n-grams @@ -9,15 +9,15 @@ test_that("qgram edge cases",{ expect_equivalent(qgrams(NA,q=1), matrix(0,nrow=1,ncol=0)) # skip all expect_equivalent(qgrams(c("a","ab"), q=2), as.matrix(table("ab"))) # skip q>nchar expect_equivalent(qgrams(c("a"),q=2), matrix(0,nrow=1,ncol=0)) # skip all - expect_equivalent(qgrams(c(''),q=0), as.matrix(table(''))) # empty string, q=0 -}) + expect_equivalent(qgrams(c(''),q=0), matrix(table(''))) # empty string, q=0 + -test_that("qgrams",{ +## qgrams expect_equivalent(qgrams("a",q=1),array(1,dim=c(1,1))) expect_equivalent(qgrams("a",q=1,useBytes=TRUE),array(1,dim=c(1,1))) -}) -test_that("seq_qgrams",{ + +## seq_qgrams expect_equivalent( seq_qgrams(1:3,2:4,q=2) ,matrix(c( @@ -26,4 +26,4 @@ test_that("seq_qgrams",{ ,3,4,0,1 ),nrow=3,byrow=TRUE) ) -}) + diff --git a/pkg/inst/tinytest/test_seq_dist.R b/pkg/inst/tinytest/test_seq_dist.R new file mode 100644 index 0000000..e3db127 --- /dev/null +++ b/pkg/inst/tinytest/test_seq_dist.R @@ -0,0 +1,79 @@ +options(sd_num_thread=2) +## seq_dist +# tests against cases that used to segfault when we did not check +# NULL cases. +expect_error(seq_dist(a=list(c(1L,2L,3L)), b=list(c(2L,1L,3L)),nthread=1:4)) +expect_error(seq_dist(a=list(c(1L,2L,3L)), b=list(c(2L,1L,3L)),nthread="foo")) +expect_error(seq_dist(a=list(c(1L,2L,3L)), b=list(c(2L,1L,3L)),nthread=integer(0))) +expect_error(seq_dist(a=list(c(1L,2L,3L)), b=list(c(2L,1L,3L)),nthread=NULL)) + +# A simple test to see that everything is passed on to the correct +# algorithm +## Methods are selected and computed correctly + expect_equal( + seq_dist(a = list(c(1L,2L,3L)), b = list(c(2L,1L,3L)), method="osa") + , 1 ) + expect_equal( + seq_dist(a = list(c(1L,2L,3L)), b = list(c(2L,1L,3L)), method="lv") + , 2 ) + # the case setting 'dl' apart from 'osa' + expect_equal( + seq_dist(a = list(c(2L,1L)), b = list(c(1L,3L,2L)), method="dl") + , 2 ) + expect_equal( + seq_dist(a = list(c(1L,2L,3L)), b = list(c(1L,0L,3L)), method="hamming") + , 1 ) + expect_equal( + seq_dist(a = list(c(1L,2L,3L)), b = list(c(1L,0L,3L)), method="lcs") + , 2 ) + expect_equal( + seq_dist(a = list(c(1L,2L,3L)), b = list(c(1L,0L,3L)), method="qgram",q=2) + , 4 ) + + expect_equal( + round(1-seq_dist(list(utf8ToInt("martha")),list(utf8ToInt("marhta")),method='jw'),3) + , 0.944 + ) + expect_error( + seq_dist(a = list(c(1L,2L,3L)), b = list(c(1L,0L,3L)), method="soundex") + ) + + +## Conversion for non-integer-list arguments + expect_equal(seq_dist(list(c(1,2,3)),list(c(2,3,4))),seq_dist(as.numeric(c(1,2,3)),as.numeric(c(2,3,4)))) + expect_equal(seq_dist(list(c(1,2,3)),list(c(2,3,4))),seq_dist(c(1,2,3), c(2,3,4))) + expect_equal(seq_distmatrix(list(c(1,2,3)),list(c(2,3,4))), seq_distmatrix(as.numeric(c(1,2,3)),as.numeric(c(2,3,4)))) + expect_equal(seq_distmatrix(list(c(1,2,3)),list(c(2,3,4))), seq_distmatrix(c(1,2,3),c(2,3,4))) + expect_equal(seq_distmatrix(list(c(1,2,3))),seq_distmatrix(c(1,2,3))) + expect_equal(seq_distmatrix(list(c(1,2,3))),seq_distmatrix(as.numeric(c(1,2,3)))) + + +## Some edge cases + expect_equal(length(seq_dist(list(),list(c(1L)))),0) + expect_equal(length(seq_dist(list(),list())),0) + + +## Elementary tests on seq_distmatrix + + expect_equivalent(seq_distmatrix(1:10),dist(0)) + expect_equivalent(seq_distmatrix(1:10,list(1:10)),matrix(0)) + expect_equivalent( + as.matrix(seq_distmatrix(list(c(1,2,3),c(2,3,4))) ) + , matrix(c(0,2,2,0),nrow=2) + ) + expect_equal( + as.matrix(seq_distmatrix(list(x=c(1,2,3),y=c(2,3,4)),useNames="names") ) + , matrix(c(0,2,2,0),nrow=2,dimnames=list(c('x','y'),c('x','y'))) + ) + expect_equal( + seq_distmatrix(list(x=c(1,2,3),y=c(2,3,4)),list(x=c(1,2,3),y=c(2,3,4)),useNames="names") + , matrix(c(0,2,2,0),nrow=2,dimnames=list(c('x','y'),c('x','y'))) + ) + expect_equal(class(seq_distmatrix(list(c(1,2,3),c(2,3,4)))),"dist") + expect_equivalent( + as.matrix(seq_distmatrix(list(c(1,2,3),c(2,3,4))),seq_distmatrix(list(c(1,2,3),c(2,3,4)),list(c(1,2,3),c(2,3,4))) ) + , matrix(c(0,2,2,0),nrow=2) + ) + + + diff --git a/pkg/tests/testthat/testStringdist.R b/pkg/inst/tinytest/test_stringdist.R similarity index 86% rename from pkg/tests/testthat/testStringdist.R rename to pkg/inst/tinytest/test_stringdist.R index 4c53536..ffb45c6 100644 --- a/pkg/tests/testthat/testStringdist.R +++ b/pkg/inst/tinytest/test_stringdist.R @@ -1,25 +1,32 @@ - -library(testthat) +options(sd_num_thread=2) ### ------------------------------------------------------------- -context("General ") -test_that("Argument parsing",{ +##General +## Argument parsing expect_equal(stringdist(character(0),"a"),numeric(0)) expect_equal(stringdist("a",character(0)),numeric(0)) expect_error(stringdist("a","b",weight=c(-1,1,1,1))) expect_error(stringdist("a","b",weight=c(1,0,1,1))) expect_error(stringdist("a","b",weight=c(1,1,1,4))) + expect_error(stringdist("a","b",nthread=1:4)) + expect_error(stringdist("a","b",nthread="foo")) + expect_error(stringdist("a","b",nthread=integer(0))) + expect_error(stringdist("a","b",nthread=NULL)) expect_warning(stringdist(letters[1:3],letters[1:2])) expect_warning(stringdist(list('a'),'a')) expect_warning(stringdist('a',list('a'))) expect_warning(stringdistmatrix(list('a'))) expect_warning(stringdistmatrix(list('a'),list('b'))) -}) + expect_error(stringdistmatrix("a","b",nthread=1:4)) + expect_error(stringdistmatrit("a","b",nthread="foo")) + expect_error(stringdistmatrit("a","b",nthread=integer(0))) + expect_error(stringdistmatrit("a","b",nthread=NULL)) + ### ------------------------------------------------------------- -context("Optimal String Alignment") -test_that("Edge cases in OSA method",{ +## Optimal String Alignment +## Edge cases in OSA method expect_equal(stringdist( "", "",method='osa'),0) expect_equal(stringdist( "","a",method='osa'),1) expect_equal(stringdist("a", "",method='osa'),1) @@ -30,22 +37,15 @@ test_that("Edge cases in OSA method",{ expect_equal(sum(is.na(stringdist(c("a", NA, "b", "c"), c("aa", "bb", "cc", "dd")))),1) -}) - -test_that("max distance yields warning",{ - expect_warning(stringdist("abc","abc",method='osa',maxDist=1)) -}) - -test_that("transpositions are found",{ +## transpositions are found expect_equal(stringdist("ab","ba",method='osa'),1) -}) -test_that("Shortest argument is recycled",{ +## Shortest argument is recycled expect_equal(stringdist(c('a','b'),'a',method='osa'),c(0,1)) expect_equal(stringdist('a',c('a','b'),method='osa'),c(0,1)) -}) -test_that("weights are handled correctly",{ + +## weights are handled correctly # deletion expect_equal(stringdist("a","ab", method='osa',weight=c(0.5,1,1,1)),0.5) # insertion @@ -98,32 +98,29 @@ test_that("weights are handled correctly",{ expect_equal(stringdist("a","b",method="dl",weight=c(i=.1,d=1,s=.3,t=1)),.3) expect_equal(stringdist("leia","leela",method="dl",weight=c(i=1,d=.1,s=1,t=1)),2) -}) -test_that("NA's are handled correctly",{ +## NA's are handled correctly expect_true(is.na(stringdist(NA ,'a',method='osa'))) expect_true(is.na(stringdist('a',NA ,method='osa'))) expect_true(is.na(stringdist(NA ,NA ,method='osa'))) -}) ### ------------------------------------------------------------- -context("Levenstein") -test_that("Edge cases in Levenshtein method",{ +## Levenstein +## Edge cases in Levenshtein method expect_equal(stringdist( "", "",method='lv'),0) expect_equal(stringdist( "","a",method='lv'),1) expect_equal(stringdist("a", "",method='lv'),1) expect_equal(stringdist("a","a",method='lv'),0) expect_equal(sum(is.na(stringdist(c("a", NA, "b", "c"), c("aa", "bb", "cc", "dd"),method="lv"))),1) -}) -test_that("Shortest argument is recycled",{ +## Shortest argument is recycled expect_equal(stringdist(c('a','b'),'a',method='lv'),c(0,1)) expect_equal(stringdist('a',c('a','b'),method='lv'),c(0,1)) -}) -test_that("weights are handled correctly",{ + +## weights are handled correctly # deletion expect_equal(stringdist("a","ab", method='lv',weight=c(0.5,1,1)),0.5) # insertion @@ -135,30 +132,28 @@ test_that("weights are handled correctly",{ stringdist("abc","ac",method='lv',weight=c(0.5,1,1,1)), stringdist("ac","abc",method='lv',weight=c(1,0.5,1,1)) ) -}) -test_that("NA's are handled correctly",{ +## NA's are handled correctly expect_true(is.na(stringdist(NA ,'a',method='lv'))) expect_true(is.na(stringdist('a',NA ,method='lv'))) expect_true(is.na(stringdist(NA ,NA ,method='lv'))) -}) + ### ------------------------------------------------------------- -context("Damerau-Levenstein") -test_that("Edge cases in DL method",{ +## Damerau-Levenstein +## Edge cases in DL method expect_equal(stringdist( "", "",method='dl'),0) expect_equal(stringdist( "","a",method='dl'),1) expect_equal(stringdist("a", "",method='dl'),1) expect_equal(stringdist("a","a",method='dl'),0) -}) -test_that("Shortest argument is recycled",{ +## Shortest argument is recycled expect_equal(stringdist(c('a','b'),'a',method='dl'),c(0,1)) expect_equal(stringdist('a',c('a','b'),method='dl'),c(0,1)) -}) -test_that("weights are handled correctly",{ + +## weights are handled correctly # deletion expect_equal(stringdist("a","ab", method='dl',weight=c(0.5,1,1,1)),0.5) # insertion @@ -172,69 +167,67 @@ test_that("weights are handled correctly",{ stringdist("abc","ac",method='dl',weight=c(0.5,1,1,1)), stringdist("ac","abc",method='dl',weight=c(1,0.5,1,1)) ) -}) -test_that("NA's are handled correctly",{ +## NA's are handled correctly expect_true(is.na(stringdist(NA ,'a',method='dl'))) expect_true(is.na(stringdist('a',NA ,method='dl'))) expect_true(is.na(stringdist(NA ,NA ,method='dl'))) -}) + ### ------------------------------------------------------------- -context("Longest Common Substring") -test_that("Edge cases in LCS method",{ +## Longest Common Substring +## Edge cases in LCS method expect_equal(stringdist( "", "",method='lcs'),0) expect_equal(stringdist( "","a",method='lcs'),1) expect_equal(stringdist("a", "",method='lcs'),1) expect_equal(stringdist("a","a",method='lcs'),0) expect_equal(sum(is.na(stringdist(c("a", NA, "b", "c"), c("aa", "bb", "cc", "dd"),method="lcs"))),1) -}) -test_that("Shortest argument is recycled",{ + +## Shortest argument is recycled expect_equal(stringdist(c('a','b'),'a',method='lcs'),c(0,2)) expect_equal(stringdist('a',c('a','b'),method='lcs'),c(0,2)) -}) -test_that("NA's are handled correctly",{ + +## NA's are handled correctly expect_true(is.na(stringdist(NA ,'a',method='lcs'))) expect_true(is.na(stringdist('a',NA ,method='lcs'))) expect_true(is.na(stringdist(NA ,NA ,method='lcs'))) -}) + ### ------------------------------------------------------------- -context("Hamming distance") -test_that("Edge cases in DL method",{ +## Hamming distance +## Edge cases in DL method expect_equal(stringdist( "", "",method='h'),0) expect_equal(stringdist("a","a",method='h'),0) expect_equal(sum(is.na(stringdist(c("a", NA, "b", "c"), c("aa", "bb", "cc", "dd"),method="h"))),1) -}) -test_that("Unequal string lengths",{ + +## Unequal string lengths expect_equal(stringdist("aa","a",method="h"),Inf) expect_equal(stringdist("a","aa",method="h"),Inf) -}) -test_that("Shortest argument is recycled",{ +## Shortest argument is recycled expect_equal(stringdist(c('a','b'),'a',method='h'),c(0,1)) expect_equal(stringdist('a',c('a','b'),method='h'),c(0,1)) -}) -test_that("NA's are handled correctly",{ + +## NA's are handled correctly expect_true(is.na(stringdist(NA ,'a',method='h'))) expect_true(is.na(stringdist('a',NA ,method='h'))) expect_true(is.na(stringdist(NA ,NA ,method='h'))) -}) + ### ------------------------------------------------------------- -context("Q-gram distance") +## Q-gram distance -test_that("Edge cases in qgram method",{ +## Edge cases in qgram method expect_equal(stringdist( "", "",method='qgram',q=0), 0) expect_equal(stringdist( "", "",method='qgram',q=1),0) expect_equal(stringdist( "","a",method='qgram',q=1),1) @@ -242,32 +235,30 @@ test_that("Edge cases in qgram method",{ expect_equal(stringdist("a","a",method='qgram',q=1), 0) expect_error(stringdist("aa","bb",method='qgram',q=-2)) expect_equal(sum(is.na(stringdist(c("a", NA, "b", "c"), c("aa", "bb", "cc", "dd"),method="qgram"))),1) -}) -test_that("Shortest argument is recycled",{ +## Shortest argument is recycled expect_equal(stringdist(c('a','b'),'a',method='qgram',q=1),c(0,2)) expect_equal(stringdist('a',c('a','b'),method='qgram',q=1),c(0,2)) -}) -test_that("NA's are handled correctly",{ + +## NA's are handled correctly expect_true(is.na(stringdist(NA ,'a',method='qgram'))) expect_true(is.na(stringdist('a',NA ,method='qgram'))) expect_true(is.na(stringdist(NA ,NA ,method='qgram'))) -}) -test_that("binary tree is cleaned up properly in qgram-tree",{ +## binary tree is cleaned up properly in qgram-tree # explanation: the binary tree storing unique q-grams and q-gram counts is re-used when looping # over string pairs. (this is not the case with the unsorted lookup table in 'qgram') d <- stringdist('abcde',c('edcba','edcba'),method='qgram',q=2) expect_equal(d[1],d[2]) -}) + ### ------------------------------------------------------------- -context("cosine distance") +## cosine distance # basic engine is q-gram so we need limited testing -test_that("cosine distance computes correctly",{ +## cosine distance computes correctly expect_equal( round(stringdist("aaa","abc",method="cosine",q=1),8), round(1-1/sqrt(3),8) @@ -282,11 +273,11 @@ test_that("cosine distance computes correctly",{ # note that 1 - 2/(sqrt(2)*sqrt(2)) != 0, so this used to give ~2.2E-16. expect_equal( stringdist("ab","ab",method="cosine"),0.0,tolerance=0.0 ) expect_equal(sum(is.na(stringdist(c("a", NA, "b", "c"), c("aa", "bb", "cc", "dd"),method="cosine"))),1) -}) -context("Jaccard distance") + +### Jaccard distance # basic engine is q-gram so we need limited testing -test_that("Jaccard distance computes correctly",{ +## Jaccard distance computes correctly expect_equal( round(stringdist("aaa","abc",method="jaccard",q=1),8), round(1-1/3,8) @@ -296,12 +287,12 @@ test_that("Jaccard distance computes correctly",{ 1.0 ) expect_equal(sum(is.na(stringdist(c("a", NA, "b", "c"), c("aa", "bb", "cc", "dd"),method="jaccard"))),1) -}) + ### ------------------------------------------------------------- -context("Jaro") -test_that("basic examples and edge cases work",{ +## Jaro +## basic examples and edge cases work # strings of length 1 expect_equal(stringdist("a","a",method='jw'),0); expect_equal(stringdist("a","b",method='jw'),1); @@ -317,9 +308,9 @@ test_that("basic examples and edge cases work",{ expect_equal( stringdist("RICK WARREN","WARREN BUFFET",method="jw") , 1 - (1/3)*(7/13 + 7/11 + (7-3.5)/7)) -}) -test_that("Extended examples work",{ + +## Extended examples work # cases from wikipedia expect_equal( round(1-stringdist("martha","marhta",method='jw'),3), @@ -357,9 +348,9 @@ test_that("Extended examples work",{ expect_equal(stringdist("axiou","aaeeiioouu",method='jw'),1-(4/5+4/10 + 4/4)/3); # non-matching characters in both strings expect_equal(stringdist("abcdeu","abxde",method='jw'),1-(4/6+4/5+4/4)/3); -}) -test_that("distance is symmetric",{ + +## distance is symmetric expect_equal( round(stringdist("martha","marhta",method='jw'),8), round(stringdist("marhta","martha",method='jw'),8) @@ -368,22 +359,21 @@ test_that("distance is symmetric",{ round(stringdist("dicksonx","dixon",method='jw'),8), round(stringdist("dixon","dicksonx",method='jw'),8) ) -}) -test_that("Shortest argument is recycled",{ + +## Shortest argument is recycled expect_equal(stringdist(c('a','b'),'a',method='jw'),c(0,1)) expect_equal(stringdist('a',c('a','b'),method='jw'),c(0,1)) -}) -test_that("NA's are handled correctly",{ + +## NA's are handled correctly expect_true(is.na(stringdist(NA ,'a',method='jw'))) expect_true(is.na(stringdist('a',NA ,method='jw'))) expect_true(is.na(stringdist(NA ,NA ,method='jw'))) -}) ### ------------------------------------------------------------- -context("Jaro-Winkler") -test_that("wikipedia examples",{ +## Jaro-Winkler +## wikipedia examples expect_equal( round(stringdist("martha","marhta",method="jw",p=0.1),3), 1-0.961 @@ -396,22 +386,20 @@ test_that("wikipedia examples",{ round(stringdist("dixon","dicksonx",method="jw",p=0.1),3), round(1-0.813,3) ) -}) -test_that("Winkler's boost parameter",{ + +## Winkler's boost parameter expect_equal( stringdist("john doe","jane doe",method="jw",p=0.1, bt=0) , stringdist("john doe","jane doe",method="jw",p=0.1, bt=0.1)) - expect_lt( + expect_true( stringdist("john doe","jane doe",method="jw",p=0.1, bt=0.1) - , stringdist("john doe","jane doe",method="jw",p=0.1, bt=0.8)) -}) + < stringdist("john doe","jane doe",method="jw",p=0.1, bt=0.8)) - -context("stringdistmatrix") -test_that("dimensions work out",{ +## stringdistmatrix +## dimensions work out expect_equivalent( dim(stringdistmatrix(c("aa","bb","cc"),c("aa","cc"))), c(3,2) @@ -423,19 +411,18 @@ test_that("dimensions work out",{ expect_equivalent( # bug #28 dim(stringdistmatrix('foo',letters[1:3])), c(1,3) ) -}) -test_that("stringdistmatrix-lower-tri can output long vectors",{ +## stringdistmatrix-lower-tri can output long vectors # skipped on CRAN because of high memory use. - skip_on_cran() - # Error when input vector yields a vector too big for a long vector. - out <- tryCatch(stringdistmatrix(character(100663296+1),method="hamming") - , error = function(e) e$message ) - expect_equal(class(out),"character") - expect_match(out, "exceeds maximum allowed") -}) - -test_that('stringdistmatrix yields correct distances',{ + if (at_home()){ + # Error when input vector yields a vector too big for a long vector. + out <- tryCatch(stringdistmatrix(character(100663296+1),method="hamming") + , error = function(e) e$message ) + expect_equal(class(out),"character") + expect_true(grepl("exceeds maximum allowed",out)) + } + +## stringdistmatrix yields correct distances x <- paste0('Mot',intToUtf8(0x00F6),'rhead') # correct spelling y <- 'Motorhead' # Pissing off Lemmy. v <- c(x,y) @@ -453,9 +440,9 @@ test_that('stringdistmatrix yields correct distances',{ stringdistmatrix(v,v,useBytes=TRUE) , matrix(c(d11,d12,d12,d22),nrow=2,ncol=2) ) -}) -test_that("stringdistmatrix gives correct labels",{ + +## stringdistmatrix gives correct labels a <- c(k1="jan",k2="pier",k3="joris") b <- c(f1 = "jip", f2="janneke") expect_equal( @@ -471,10 +458,9 @@ test_that("stringdistmatrix gives correct labels",{ , list(c("k1","k2","k3"),c("f1","f2")) ) -}) -test_that("stringdistmatrix with single argument",{ +## stringdistmatrix with single argument d <- stringdistmatrix(c("aap","noot","mies","boom","roos","vis")) expect_equal(class(d),"dist") expect_equal(length(d),15) @@ -506,18 +492,17 @@ test_that("stringdistmatrix with single argument",{ , stringdistmatrix(x,x,method="jw",p=0.1) ) -}) -context("stringdist: useBytes") -test_that("useBytes gets NA",{ +## stringdist: useBytes +## useBytes gets NA expect_true(is.na(stringdist('a',NA,method='osa',useBytes=TRUE))) expect_true(is.na(stringdist('a',NA,method='lv',useBytes=TRUE))) expect_true(is.na(stringdist('a',NA,method='dl',useBytes=TRUE))) expect_true(is.na(stringdist('a',NA,method='hamming',useBytes=TRUE))) -}) -test_that("useBytes translates correctly to numeric",{ + +## useBytes translates correctly to numeric # smoketest set.seed(1) x <- sapply(sample(5:25,10,replace=TRUE),function(x) paste(letters[x],collapse="")) @@ -544,9 +529,8 @@ test_that("useBytes translates correctly to numeric",{ stringdist(x,y,method='qgram',q=3,useBytes=TRUE) , stringdist(x,y,method='qgram',q=3,useBytes=FALSE)) -}) -test_that("useBytes really analyses bytes",{ +## useBytes really analyses bytes x <- paste0('Mot',intToUtf8(0x00F6),'rhead') # correct spelling y <- 'Motorhead' # Pissing off Lemmy. expect_equal(stringdist(x,y,method='dl',useBytes=TRUE), 2) @@ -559,14 +543,12 @@ test_that("useBytes really analyses bytes",{ ) expect_equal(stringdist(x,y,method='lcs',useBytes=TRUE), 3) expect_equal(stringdist(x,y,method='qgram',q=3,useBytes=TRUE), 7) -}) - ### ------------------------------------------------------------- -context("Soundex distance") +## Soundex distance -test_that("",{ +## expect_equal(stringdist("", "0000",method='soundex'),0) expect_equal(stringdist("john","jan",method='soundex'),0) expect_equal(stringdist("schoen","son",method='soundex'),0) @@ -589,20 +571,20 @@ test_that("",{ x <- "Motorhead" y <- paste0("Mot",intToUtf8(0x00F6),"rhead") # with o-umlaut expect_warning(stringdist(x,y,method='soundex',useBytes=TRUE)) -}) -test_that("Shortest argument is recycled",{ + +## Shortest argument is recycled expect_equal(stringdist(c('a','b'),'a',method='soundex'),c(0,1)) expect_equal(stringdist('a',c('a','b'),method='soundex'),c(0,1)) -}) -test_that("NA's are handled correctly",{ + +## NA's are handled correctly expect_true(is.na(stringdist(NA ,'a',method='soundex'))) expect_true(is.na(stringdist('a',NA ,method='soundex'))) expect_true(is.na(stringdist(NA ,NA ,method='soundex'))) -}) -test_that("non-printable ascii and non-ascii encoding is detected",{ + +## non-printable ascii and non-ascii encoding is detected ouml <- intToUtf8("0x00F6") # non-ascii within string x <- paste0("Mot",ouml,"rhead") @@ -620,7 +602,7 @@ test_that("non-printable ascii and non-ascii encoding is detected",{ expect_warning(stringdist('Lemmy',x,method='soundex')) expect_warning(stringdist('Ozzy',y,method='soundex')) expect_warning(stringdist('Ozzy',z,method='soundex')) -}) + diff --git a/pkg/tests/testthat/testStringsim.R b/pkg/inst/tinytest/test_stringsim.R similarity index 50% rename from pkg/tests/testthat/testStringsim.R rename to pkg/inst/tinytest/test_stringsim.R index c3e5e60..4df23ec 100644 --- a/pkg/tests/testthat/testStringsim.R +++ b/pkg/inst/tinytest/test_stringsim.R @@ -1,39 +1,34 @@ -library(testthat) - -context("stringsim") +options(sd_num_thread=2) +## stringsim # We expect that two completely different strings have a similarity of # 0 and two completely equal strings a similarity of 1 methods <- c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex") for (method in methods) { - test_that(paste0("Similarity is between 0 and 1 for ", method), { - expect_that(stringsim("bb", "cc", method=method), equals(0)) - expect_that(stringsim("bb", "bb", method=method), equals(1)) - }) + expect_equal(stringsim("bb", "cc", method=method), 0) + expect_equal(stringsim("bb", "bb", method=method), 1) } + +## edgecases for (method in methods[c(1:5,9:10)]){ - test_that(paste0("Edge cases for ", method), { - expect_that(stringsim(c("a", ""), "", method=method), equals(c(0, 1))) + expect_equal(stringsim(c("a", ""), "", method=method), c(0, 1)) - expect_that(stringsim(c("kkk", "bbb"), "bbb", method=method), - equals(stringsim("bbb", c("kkk", "bbb"), method=method))) - }) + expect_equal(stringsim(c("kkk", "bbb"), "bbb", method=method), + stringsim("bbb", c("kkk", "bbb"), method=method)) } for (method in methods[6:8]){ - test_that(paste0("Edge cases for ", method), { - expect_that(stringsim(c("a", ""), "", method=method,q=0), equals(c(1, 1))) + expect_equal(stringsim(c("a", ""), "", method=method,q=0), c(1, 1)) - expect_that(stringsim(c("kkk", "bbb"), "bbb", method=method,q=0), - equals(stringsim("bbb", c("kkk", "bbb"), method=method,q=0))) - }) + expect_equal(stringsim(c("kkk", "bbb"), "bbb", method=method,q=0), + stringsim("bbb", c("kkk", "bbb"), method=method,q=0)) } -test_that("Stringsim gets correct values with or without useBytes",{ +## Stringsim gets correct values with or without useBytes x <- "ao" y <- paste0("a",intToUtf8(0x00F6)) # o-umlaut expect_equal(stringsim(x,y,method="osa", useBytes=FALSE), 1-1/2) @@ -54,16 +49,30 @@ test_that("Stringsim gets correct values with or without useBytes",{ expect_equal(stringsim(x,y,method="jaccard", q=1, useBytes=TRUE ), 1-3/4) expect_equal(stringsim(x,y,method="jw", useBytes=FALSE), 1-1/3) expect_equal(stringsim(x,y,method="jw", useBytes=TRUE ), (1/2 + 1/3 +1)/3) -}) -context("seq_sim") +# stringsimmatrix + x <- names(islands)[1:10] + y <- rev(x) # o-umlaut + expect_true(inherits(stringsimmatrix(x,y,method="osa", useBytes=FALSE), "matrix")) + expect_equal(dim(stringsimmatrix(x,y,method="osa", useBytes=FALSE)), c(10, 10)) + expect_equal(stringsimmatrix(x,y,method="osa", useBytes=FALSE)[2, 2], 0.2) + expect_true(inherits(stringsimmatrix(x,method="osa", useBytes=FALSE), "matrix")) + expect_equal(dim(stringsimmatrix(x,method="osa", useBytes=FALSE)), c(10, 10)) + expect_equal(stringsimmatrix(x,method="osa", useBytes=FALSE)[2, 9], 0.2) + expect_warning(stringdistmatrix(list('a'))) + expect_warning(stringdistmatrix(list('a'),list('b'))) + +## seq_sim -test_that("elementary seq_sim test",{ +# We used to have list(1:3, 2:4) and list(1:3). This occasionally +# gave failing tests, and only in the context of expect_equal (both +# for tinytest and testthat. Therefore this may point to a hard-to-reproduce +# bug in R's JIT compiler. expect_equal( - seq_sim(list(1:3,2:4),list(1:3)) - , stringsim(c("abc","bcd"),"abc") + seq_sim(list(c(1,2,3),c(2,3,4)), list(c(1,2,3)), method="cosine") + , stringsim(c("abc","bcd"),"abc", method="cosine") ) -}) + diff --git a/pkg/src/R_register_native.c b/pkg/src/R_register_native.c index 841c857..5f8cdc5 100644 --- a/pkg/src/R_register_native.c +++ b/pkg/src/R_register_native.c @@ -1,4 +1,3 @@ -#include "stringdist.h" #include #include #include // for NULL @@ -9,6 +8,7 @@ */ /* .Call calls */ +extern SEXP R_afind(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP R_all_int(SEXP); extern SEXP R_amatch(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP R_get_qgrams(SEXP, SEXP); @@ -18,6 +18,7 @@ extern SEXP R_soundex(SEXP, SEXP); extern SEXP R_stringdist(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); static const R_CallMethodDef CallEntries[] = { + {"R_afind", (DL_FUNC) &R_afind, 10}, {"R_all_int", (DL_FUNC) &R_all_int, 1}, {"R_amatch", (DL_FUNC) &R_amatch, 12}, {"R_get_qgrams", (DL_FUNC) &R_get_qgrams, 2}, @@ -31,8 +32,8 @@ static const R_CallMethodDef CallEntries[] = { void R_init_stringdist(DllInfo *dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); - R_useDynamicSymbols(dll, TRUE); - + R_useDynamicSymbols(dll, FALSE); + /* used by external packages linking to internal xts code from C */ R_RegisterCCallable("stringdist","R_all_int",(DL_FUNC) &R_all_int); R_RegisterCCallable("stringdist","R_amatch",(DL_FUNC) &R_amatch); @@ -41,4 +42,5 @@ void R_init_stringdist(DllInfo *dll) R_RegisterCCallable("stringdist","R_lower_tri",(DL_FUNC) &R_lower_tri); R_RegisterCCallable("stringdist","R_soundex",(DL_FUNC) &R_soundex); R_RegisterCCallable("stringdist","R_stringdist",(DL_FUNC) &R_stringdist); + } diff --git a/pkg/src/Rstringdist.c b/pkg/src/Rstringdist.c index 7e62122..9db0d0a 100644 --- a/pkg/src/Rstringdist.c +++ b/pkg/src/Rstringdist.c @@ -17,35 +17,34 @@ * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com */ -#define USE_RINTERNALS #include #include -#include -#include #include -#include "utils.h" -#include "stringdist_pkg.h" #ifdef _OPENMP #include #endif +#include "utils.h" +#include "stringdist.h" #define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) #define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) -// TODO: catch error and report. + static Stringdist *R_open_stringdist(Distance d, int max_len_a, int max_len_b, SEXP weight, SEXP p, SEXP bt, SEXP q){ Stringdist *sd = NULL; if (d == osa || d == lv || d == dl || d == hamming || d == lcs){ sd = open_stringdist(d, max_len_a, max_len_b, REAL(weight)); - } else if ( d == qgram || d == cosine || d == jaccard ){ + } else if (d == qgram || d == cosine || d == jaccard || d == running_cosine){ sd = open_stringdist(d, max_len_a, max_len_b, (unsigned int) INTEGER(q)[0]); } else if ( d == jw ){ sd = open_stringdist(d, max_len_a, max_len_b, REAL(weight), REAL(p)[0], REAL(bt)[0]); } else if (d == soundex) { sd = open_stringdist(d, max_len_a, max_len_b); } - + if ( sd == NULL ){ + error("Could not allocate enough memory"); + } return sd; } @@ -65,7 +64,7 @@ SEXP R_stringdist(SEXP a, SEXP b, SEXP method // output vector SEXP yy; - PROTECT(yy = allocVector(REALSXP, nt)); + yy = PROTECT(allocVector(REALSXP, nt)); double *y = REAL(yy); #ifdef _OPENMP @@ -136,7 +135,7 @@ SEXP R_amatch(SEXP x, SEXP table, SEXP method , ntable = length(table) , no_match = INTEGER(nomatch)[0] , match_na = INTEGER(matchNA)[0] - , bytes = INTEGER(x)[0] + , bytes = INTEGER(useBytes)[0] , ml_x = max_length(x) , ml_t = max_length(table) , intdist = TYPEOF(x) == VECSXP ? 1 : 0; // list of integers? @@ -150,7 +149,7 @@ SEXP R_amatch(SEXP x, SEXP table, SEXP method // output vector SEXP yy; - PROTECT(yy = allocVector(INTSXP, nx)); + yy = PROTECT(allocVector(INTSXP, nx)); int *y = INTEGER(yy); #ifdef _OPENMP @@ -215,7 +214,7 @@ SEXP R_amatch(SEXP x, SEXP table, SEXP method // Lower tridiagonal distance matrix for a single vector argument. -static int get_j(R_xlen_t k, int n){ +static int get_j(R_xlen_t k, R_xlen_t n){ double nd = (double) n; double kd = (double) k; double u = ceil( (2.*nd - 3.)/2. - sqrt(pow(nd-.5,2.) - 2.*(kd + 1.)) ); @@ -245,13 +244,13 @@ SEXP R_lower_tri(SEXP a, SEXP method , N = n*(n-1)/2; if ( n > MAXN ){ - error("Length of input vector (%d) exceeds maximum allowed for this platform (%d)",n,MAXN); + error("Length of input vector (%1.0f) exceeds maximum allowed for this platform (%1.0f)",(double) n,(double) MAXN); } // output vector SEXP yy; - PROTECT(yy = allocVector(REALSXP, N)); + yy = PROTECT(allocVector(REALSXP, N)); // nothing to do if n=1 if (n == 1L) goto end ; double *y = REAL(yy); @@ -280,13 +279,14 @@ SEXP R_lower_tri(SEXP a, SEXP method t = s + ml + 1L; int len_s, len_t, isna_s, isna_t - , i = 0, j = 0 + , j = 0 , thread_id = 0 - , n_threads = 1 - , col_max = n-1; + , n_threads = 1; R_xlen_t pp = 0 , k_start = 0 + , i = 0 + , col_max = n-1 , k_end = N; #ifdef _OPENMP @@ -297,7 +297,7 @@ SEXP R_lower_tri(SEXP a, SEXP method pp = N / n_threads; k_start = thread_id * pp; k_end = (thread_id < n_threads - 1 ) ? k_start + pp : N; - j = get_j(k_start,n); + j = get_j(k_start, n); i = k_start + j * (j - 2*n + 3)/2; for ( R_xlen_t k=k_start; k < k_end; k++ ){ i++; @@ -326,11 +326,122 @@ SEXP R_lower_tri(SEXP a, SEXP method return(yy); } +// afind +// For each string in 'a', return the starting position of +// the best match with 'pattern'. +SEXP R_afind(SEXP a, SEXP pattern, SEXP width + , SEXP method, SEXP weight, SEXP p, SEXP bt + , SEXP q, SEXP useBytes, SEXP nthrd) +{ + + int na = length(a) // nr of texts to search + , npat = length(pattern) // nr of patterns + , ml_a = max_length(a) // max length of searched string + , ml_b = max_length(pattern) // max length of the pattern. + , intdist = 0 // no distances between integer sequences (yet) + , bytes = INTEGER(useBytes)[0]; + + + int *window = INTEGER(width); // access the window widths. + + // output list + SEXP out_list; + out_list = PROTECT(allocVector(VECSXP, 2)); + + // output location + SEXP out_loc = PROTECT(allocMatrix(INTSXP, na, npat)); + SET_VECTOR_ELT(out_list,0, out_loc); + int *yloc = INTEGER(out_loc); + + // output distance + SEXP out_dist = PROTECT(allocMatrix(REALSXP, na, npat)); + SET_VECTOR_ELT(out_list,1, out_dist); + double *ydist = REAL(out_dist); + // Setup stringdist structure. + // find maximum window length + int max_window = 0; + for ( int i=0; i= len_s ){ // is the text shorter than the window? + yloc[offset + i] = 1L; + ydist[offset + i] = stringdist(sd, s, len_s, t, len_t); + } else { // slide window over text and compute distances + max_k = len_s - current_window; + d_min = R_PosInf; + k_min = 0; + for (int k = 0; k <= max_k; k++){ + d = stringdist(sd, s + k, current_window, t, len_t); + if ( d < d_min ){ + d_min = d; + k_min = k; + } + } // end loop over windows + yloc[offset + i] = k_min + 1; + ydist[offset + i] = d_min; + reset_stringdist(sd); + } + } // end loop over patterns + } // end loop over strings + close_stringdist(sd); + } // end parallel region + UNPROTECT(3); + return(out_list); + +} // helper function to determine whether all is INTSXP SEXP R_all_int(SEXP X){ - PROTECT(X); + SEXP all_int; all_int = PROTECT(allocVector(LGLSXP,1L)); @@ -343,7 +454,7 @@ SEXP R_all_int(SEXP X){ } } - UNPROTECT(2); + UNPROTECT(1); return all_int; } diff --git a/pkg/src/dist.h b/pkg/src/dist.h index d62cab2..cf7d55b 100644 --- a/pkg/src/dist.h +++ b/pkg/src/dist.h @@ -8,7 +8,7 @@ double osa_dist(unsigned int *, int, unsigned int *, int, double *, double *); dictionary *new_dictionary(unsigned int); -void free_dictionary(); +void free_dictionary(dictionary *); double dl_dist(unsigned int *, int, unsigned int *, int, double *, dictionary *, double *); double hamming_dist(unsigned int *, int, unsigned int *, int); double lcs_dist(unsigned int *, int, unsigned int *, int, double *); @@ -16,8 +16,8 @@ double lv_dist(unsigned int *, int, unsigned int *, int, double *, double *); double osa_dist(unsigned int *, int, unsigned int *, int, double *, double *); double jaro_winkler_dist(unsigned int *, int, unsigned int *, int, double, double, double *, double *); qtree *new_qtree(int, int); -void free_qtree(); -double qgram_dist(unsigned int *, int, unsigned int *t, int, unsigned int, qtree **, int); +void free_qtree(void); +double qgram_dist(unsigned int *, int, unsigned int *, int, unsigned int, qtree **, int); double soundex_dist(unsigned int *, int, unsigned int *, int, unsigned int *); - +double running_cosine_dist(unsigned int *, int, unsigned int *, int, unsigned int, qtree **, double *); #endif diff --git a/pkg/src/dl.c b/pkg/src/dl.c index 17005a9..9f6b9ea 100644 --- a/pkg/src/dl.c +++ b/pkg/src/dl.c @@ -35,9 +35,6 @@ #include "utils.h" -#ifdef _OPENMP -#include -#endif #include "dictionary.h" /* diff --git a/pkg/src/hamming.c b/pkg/src/hamming.c index a1fa9af..6b236d8 100644 --- a/pkg/src/hamming.c +++ b/pkg/src/hamming.c @@ -25,9 +25,6 @@ #include "utils.h" -#ifdef _OPENMP -#include -#endif double hamming_dist(unsigned int *a, int len_a, unsigned int *b, int len_b){ double h=0; diff --git a/pkg/src/jaro.c b/pkg/src/jaro.c index 37a8434..36593cc 100644 --- a/pkg/src/jaro.c +++ b/pkg/src/jaro.c @@ -19,9 +19,6 @@ #include "utils.h" #include -#ifdef _OPENMP -#include -#endif // Winkler's l-factor (nr of matching characters at beginning of the string). diff --git a/pkg/src/lcs.c b/pkg/src/lcs.c index 53fad7d..5138e82 100644 --- a/pkg/src/lcs.c +++ b/pkg/src/lcs.c @@ -17,9 +17,6 @@ * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com */ -#ifdef _OPENMP -#include -#endif #include "utils.h" /* Longest common substring diff --git a/pkg/src/lv.c b/pkg/src/lv.c index 5edc43f..4cb7c30 100644 --- a/pkg/src/lv.c +++ b/pkg/src/lv.c @@ -18,9 +18,6 @@ */ #include "utils.h" -#ifdef _OPENMP -#include -#endif /* Levenshtein distance diff --git a/pkg/src/osa.c b/pkg/src/osa.c index 5eaf66c..3d62561 100644 --- a/pkg/src/osa.c +++ b/pkg/src/osa.c @@ -18,9 +18,6 @@ */ #include "utils.h" -#ifdef _OPENMP -#include -#endif /* Optimal string alignment algorithm. * Computes Damerau-Levenshtein distance, restricted to single transpositions. diff --git a/pkg/src/qgram.c b/pkg/src/qgram.c index 56e7358..7497373 100644 --- a/pkg/src/qgram.c +++ b/pkg/src/qgram.c @@ -17,16 +17,14 @@ * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com */ +#ifdef _OPENMP +#include +#endif #define USE_RINTERNALS #include #include -#include -#include #include "utils.h" -#ifdef _OPENMP -#include -#endif #include "qtree.h" @@ -79,14 +77,14 @@ typedef struct { Box *box[MAXBOXES]; int nboxes; // number of boxes on the shelf int q; // the q in q-gram - int nstr; // the number of stings compared + int nstr; // the number of strings compared } Shelf; // A wall with shelfs: one for each thread. static Shelf wall[MAX_NUM_THREADS]; // When multithreaded, check what shelf we're storing stuff. -static inline int get_shelf_num(){ +static inline int get_shelf_num(void){ int thread_num=0; #ifdef _OPENMP thread_num = omp_get_thread_num(); @@ -131,7 +129,7 @@ static int add_box(int nnodes){ } -static void clear_shelf(){ +static void clear_shelf(void){ Shelf *shelf = &wall[get_shelf_num()]; for ( int i = 0; i < shelf->nboxes; i++ ){ free_box(shelf->box[i]); @@ -195,7 +193,7 @@ qtree *new_qtree(int q, int nstr){ return NULL; } -void free_qtree(){ +void free_qtree(void){ clear_shelf(); } @@ -221,8 +219,9 @@ static int compare(unsigned int *q1, unsigned int *q2, int q){ * q : the 'q' in q-gram * iLoc : To wich count location does this q-gram contribute? * nLoc : how many locations are there? + * node : int array of length nLoc, will contain contents of a node. */ -static qtree *push(qtree *Q, unsigned int *qgram, unsigned int q, int iLoc, int nLoc ){ +static qtree *push(qtree *Q, unsigned int *qgram, unsigned int q, int iLoc, int nLoc, double *node ){ int cond; if( Q == NULL ){ // new qgram Q = (qtree *) alloc( Qtree); @@ -238,22 +237,52 @@ static qtree *push(qtree *Q, unsigned int *qgram, unsigned int q, int iLoc, int memcpy(Q->qgram, qgram, sizeof(int) * q); Q->left = NULL; Q->right= NULL; + // copy content for all iLocs to output parameter. + if (node != NULL ) memcpy(node, Q->n, sizeof(double) * nLoc); } else if ( ( cond = compare(qgram, Q->qgram, q) ) == 1) { // qgram larger than the stored qgram - Q->left = push(Q->left, qgram, q, iLoc, nLoc); + Q->left = push(Q->left, qgram, q, iLoc, nLoc, node); } else if ( cond == -1 ){ // qgram smaller than the stored qgram - Q->right = push(Q->right, qgram, q, iLoc, nLoc); + Q->right = push(Q->right, qgram, q, iLoc, nLoc, node); } else { // qgram equal to stored qgram Q->n[iLoc] += 1; + // copy content for all iLocs to output parameter. + if (node != NULL ) memcpy(node,Q->n, sizeof(double) * nLoc); } return Q; } +/* pull qgram from binary tree: decrease valaue for one of the strings. + * + * qtree : see above + * qgram : see above + * q : the 'q' in q-gram + * iLoc : To wich count location does this q-gram contribute? + * nLoc : how many locations are there? + * node : int array of length nLoc, will contain contents of a node. + */ +static qtree *pull(qtree *Q, unsigned int *qgram, unsigned int q, int iLoc, int nLoc, double *node){ + if (Q == NULL) return(NULL); + int cond = compare(qgram, Q->qgram, q); + + if ( cond == -1 ){ // qgram smaller than stored qgram + pull(Q->right, qgram, q, iLoc, nLoc, node); + } else if (cond == 1) { //qram larger than stored qgram + pull(Q->left, qgram, q, iLoc, nLoc, node); + } else { // qgram equal to stored qgram + Q->n[iLoc] -= 1; + // copy content for all iLocs to output parameter. + if (node != NULL) memcpy(node, Q->n, sizeof(double) * nLoc); + } + return Q; +} + + /* push qgrams of a string into binary tree */ static qtree *push_string(unsigned int *str, int strlen, unsigned int q, qtree *Q, int iLoc, int nLoc){ qtree *P; for ( int i=0; i < (int) (strlen - q + 1); i++ ){ - P = push(Q, str + i, q, iLoc, nLoc); + P = push(Q, str + i, q, iLoc, nLoc, NULL); if ( P == NULL ){ free_qtree(); return NULL; @@ -281,7 +310,7 @@ static void getdist(qtree *Q, double *d){ /* get x.y,||x||and ||y|| for cosine distance from the tree and set all qgram-freqencies * to 0 so the tree van be reused. */ -static void getcosine(qtree *Q, double *d){ +static void getcosine(qtree *Q, double *d, int clean){ if ( Q == NULL ) return; // inner product d[0] += (double) Q->n[0] * Q->n[1]; @@ -289,12 +318,26 @@ static void getcosine(qtree *Q, double *d){ d[1] += (double) Q->n[0]*Q->n[0]; d[2] += (double) Q->n[1]*Q->n[1]; // clean up and continue - Q->n[0] = 0; - Q->n[1] = 0; - getcosine(Q->left,d); - getcosine(Q->right,d); + if (clean){ + Q->n[0] = 0; + Q->n[1] = 0; + } + getcosine(Q->left, d, clean); + getcosine(Q->right, d, clean); } +static double cosdist(double xy, double xx, double yy){ + // x and y are equal: return precisely zero. + if (xy == xx && xy == yy){ + return 0.0; + } else { + // use fabs to avoid numerical -0. + return ( fabs(1.0- xy/( sqrt(xx) * sqrt(yy))) ); + } +} + + + /* get jaccard distance from the tree and set all qgram-freqencies * to 0 so the tree van be reused. */ @@ -375,16 +418,8 @@ double qgram_dist( getdist(Q,dist); break; case 1: - getcosine(Q, dist); - if (dist[0]==dist[1] && dist[0]==dist[2]){ - // strings are equal. Prevent machine rounding about 0.0 - dist[0] = 0.0; - } else { - // there are several ways to express the rhs (including ones that give 0L - // at equal strings) but this has least chance of overflow - // fabs is taken to avoid numerical -0. - dist[0] = fabs(1.0 - dist[0]/(sqrt(dist[1]) * sqrt(dist[2]))); - } + getcosine(Q, dist, 1); + dist[0] = cosdist(dist[0],dist[1],dist[2]); break; case 2: getjaccard(*Qp,dist); @@ -398,6 +433,74 @@ double qgram_dist( } + + +/* +* s: text to search +* x: length of s +* t: pattern +* y: length of pattern +* q: size of q-gram +* qtree: a qtree object. +* store: length 3 array to store intermediate values; +*/ +double running_cosine_dist( + unsigned int *s, + int x, + unsigned int *t, + int y, + unsigned int q, + qtree **Qp, + double *store + ){ + + double d; + + unsigned int *first_qgram; + unsigned int *last_qgram; + + // pwi: value of qgram table of pattern and window at location + // where one qgram is removed. + // pwj: value of qgram table of pattern and window at location + // where one qgram is added. + double pwi[2] = {0.,0.}, pwj[2] = {0.,0.}; + + + if ( *Qp == NULL ){ // new tree, + // push the search pattern, location 0 + *Qp = push_string(t, y, q, *Qp, 0, 2); + // push the first window + *Qp = push_string(s, x, q, *Qp, 1, 2); + store[0] = store[1] = store[2] = 0; + // store[0]: w.p (inner product) + // store[1]: p.p (squared norm of pattern) + // store[2]: w.w (squared norm of window) + getcosine(*Qp, store, 0); + d = cosdist(store[0], store[1], store[2]); + } else { // we are running + first_qgram = s - 1; + last_qgram = s + y - q; + // special case: q-gram to remove is equal to qgram to add + if (compare(first_qgram, last_qgram, q) == 0){ + d = cosdist(store[0], store[1], store[2]); + } else { + // take first q-gram of the previous window from the table. + *Qp = pull(*Qp, s-1, q, 1, 2, pwi); + // add last qgram of the current window to the table. + *Qp = push(*Qp, s+y-q, q, 1, 2, pwj); + + store[0] = store[0] - pwi[0] + pwj[0]; + store[2] = store[2] + 2*(pwj[1] - pwi[1] - 1); + d = cosdist(store[0], store[2], store[1]); + } + } + + return d; + +} + + + /* R interface to qgram tabulator */ static void count_qtree(qtree *Q, int *n){ @@ -421,13 +524,10 @@ static void get_counts( qtree *Q, int q, int *qgrams, int nLoc, int *index, doub * */ SEXP R_get_qgrams(SEXP a, SEXP qq){ - PROTECT(a); - PROTECT(qq); int q = INTEGER(qq)[0]; if ( q < 0 ){ - UNPROTECT(2); error("q must be a nonnegative integer"); } @@ -446,7 +546,8 @@ SEXP R_get_qgrams(SEXP a, SEXP qq){ for ( int i=0; i < nstr; ++i ){ str = (unsigned int *) INTEGER(VECTOR_ELT(strlist,i)); nchar = length(VECTOR_ELT(strlist,i)); - if ( str[0] == NA_INTEGER + if ( nchar == 0 + || str[0] == NA_INTEGER || q > nchar || ( q == 0 && nchar > 0 ) ){ @@ -454,7 +555,6 @@ SEXP R_get_qgrams(SEXP a, SEXP qq){ } Q = push_string(str, nchar, q, Q, iLoc, nLoc); if ( Q == NULL ){ - UNPROTECT(2); error("could not allocate enough memory"); } } @@ -469,15 +569,15 @@ SEXP R_get_qgrams(SEXP a, SEXP qq){ count_qtree(Q,nqgram); SEXP qgrams, qcount; - PROTECT(qgrams = allocVector(INTSXP, q*nqgram[0])); - PROTECT(qcount = allocVector(REALSXP, nLoc*nqgram[0])); + qgrams = PROTECT(allocVector(INTSXP, q*nqgram[0])); + qcount = PROTECT(allocVector(REALSXP, nLoc*nqgram[0])); get_counts(Q, q, INTEGER(qgrams), nLoc, index, REAL(qcount)); setAttrib(qcount, install("qgrams"), qgrams); free_qtree(); - UNPROTECT(4); + UNPROTECT(2); return(qcount); } diff --git a/pkg/src/qtree.h b/pkg/src/qtree.h index f470d4f..d1c3ffc 100644 --- a/pkg/src/qtree.h +++ b/pkg/src/qtree.h @@ -1,10 +1,6 @@ #ifndef SD_QTREE_H #define SD_QTREE_H -#ifdef _OPENMP -#include -#endif - /* binary tree; dictionary of qgrams */ typedef struct qnode { diff --git a/pkg/src/soundex.c b/pkg/src/soundex.c index adf5f98..42dc098 100644 --- a/pkg/src/soundex.c +++ b/pkg/src/soundex.c @@ -17,14 +17,8 @@ * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com */ -#define USE_RINTERNALS -#include -#include #include "utils.h" #include -#ifdef _OPENMP -#include -#endif // Translate similar sounding consonants to numeric codes; vowels are all // translated to 'a' and voiceless characters (and other characters) are @@ -245,7 +239,7 @@ SEXP R_soundex(SEXP x, SEXP useBytes) { SET_STRING_ELT(y, i, R_NaString); } else { nfail += soundex(s, len_s, sndx_int); - for (unsigned int j = 0; j < 4; ++j) sndx[j] = sndx_int[j]; + for (unsigned int j = 0; j < 4; ++j) sndx[j] = (char) sndx_int[j]; sndx[4] = 0; SET_STRING_ELT(y, i, mkChar(sndx)); } diff --git a/pkg/src/stringdist.c b/pkg/src/stringdist.c index 142d904..750aef6 100644 --- a/pkg/src/stringdist.c +++ b/pkg/src/stringdist.c @@ -22,7 +22,7 @@ #include #include #include "dist.h" -#include "stringdist_pkg.h" +#include "stringdist.h" #define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) @@ -32,7 +32,7 @@ /* * * - * TODO check for memory allocation failure + * */ Stringdist *open_stringdist(Distance d, int str_len_a, int str_len_b, ...){ va_list args; @@ -74,9 +74,13 @@ Stringdist *open_stringdist(Distance d, int str_len_a, int str_len_b, ...){ S->q = va_arg(args, unsigned int); S->tree = new_qtree(S->q, 2L); break; + case running_cosine : + S->q = va_arg(args, unsigned int); + S->tree = new_qtree(S->q, 2L); + S->work = (double *) malloc(3*sizeof(double)); + break; case jw : S->work = (double *) malloc( sizeof(double) * (str_len_a+str_len_b)); - S->weight = (double *) malloc(3L*sizeof(double)); memcpy(S->weight, va_arg(args, double *), 3*sizeof(double)); S->p = va_arg(args, double); @@ -86,10 +90,14 @@ Stringdist *open_stringdist(Distance d, int str_len_a, int str_len_b, ...){ break; default : break; - //TODO: set errno, return NULL }; va_end(args); + + if ( (d == osa || d == lv || d == dl || d == lcs || d== jw) && S->work == NULL ){ + close_stringdist(S); + return(NULL); + } return S; } @@ -102,11 +110,20 @@ void close_stringdist(Stringdist *S){ free_dictionary(S->dict); } if (S->distance == qgram || S->distance == cosine || S->distance == jaccard){ - free_qtree(S->tree); + free_qtree(); } free(S); } +void reset_stringdist(Stringdist *S){ + if (S->distance == running_cosine){ + free_qtree(); + S->tree = new_qtree(S->q, 2L); + } +} + + + double stringdist(Stringdist *S, unsigned int *str_a, int len_a, unsigned int *str_b, int len_b){ @@ -129,6 +146,8 @@ double stringdist(Stringdist *S, unsigned int *str_a, int len_a, unsigned int *s return qgram_dist(str_a, len_a, str_b, len_b, S->q, &S->tree, 1L); case jaccard : return qgram_dist(str_a, len_a, str_b, len_b, S->q, &S->tree, 2L); + case running_cosine: + return running_cosine_dist(str_a, len_a, str_b, len_b, S->q, &S->tree, S->work); case jw : return jaro_winkler_dist(str_a, len_a, str_b, len_b, S->p, S->bt, S->weight, S->work); case soundex : diff --git a/pkg/src/stringdist.h b/pkg/src/stringdist.h index 1d3841d..cbc38d1 100644 --- a/pkg/src/stringdist.h +++ b/pkg/src/stringdist.h @@ -26,7 +26,19 @@ #include "qtree.h" #include "dist.h" -typedef enum Distance { osa, lv, dl, hamming, lcs, qgram, cosine, jaccard, jw, soundex} Distance; +typedef enum Distance { + osa + , lv + , dl + , hamming + , lcs + , qgram + , cosine + , jaccard + , jw + , soundex + , running_cosine} Distance; + typedef struct { Distance distance; // workspace @@ -52,6 +64,7 @@ Stringdist *open_stringdist(Distance, int, int, ...); double stringdist(Stringdist *, unsigned int *, int, unsigned int *, int); void close_stringdist(Stringdist *S); +void reset_stringdist(Stringdist *S); diff --git a/pkg/src/stringdist_pkg.h b/pkg/src/stringdist_pkg.h deleted file mode 100644 index 1d3841d..0000000 --- a/pkg/src/stringdist_pkg.h +++ /dev/null @@ -1,61 +0,0 @@ - -/* stringdist - a C library of string distance algorithms with an interface to R. - * Copyright (C) 2013 Mark van der Loo - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com - */ - - -#ifndef SD_STRINGDIST_H -#define SD_STRINGDIST_H - -#include "dictionary.h" -#include "qtree.h" -#include "dist.h" - -typedef enum Distance { osa, lv, dl, hamming, lcs, qgram, cosine, jaccard, jw, soundex} Distance; -typedef struct { - Distance distance; - // workspace - double *work; - // [optional] weight vector - double *weight; - // dictionary object for dl-distance - dictionary *dict; - // tree object to store q-grams - qtree *tree; - // the q in qgrams - unsigned int q; - // Winkler's penalty factor - double p; - // Winkler's boost threshold - double bt; - // fail indicator - unsigned int ifail; -} Stringdist; - -Stringdist *open_stringdist(Distance, int, int, ...); - -double stringdist(Stringdist *, unsigned int *, int, unsigned int *, int); - -void close_stringdist(Stringdist *S); - - - -#endif - - - diff --git a/pkg/src/utf8ToInt.c b/pkg/src/utf8ToInt.c index c64d687..35c0619 100644 --- a/pkg/src/utf8ToInt.c +++ b/pkg/src/utf8ToInt.c @@ -17,14 +17,8 @@ * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com */ -//#define USE_RINTERNALS -#include -#include -#include "utils.h" #include -#ifdef _OPENMP -#include -#endif +#include "utils.h" /* This function is gratefully copied from the R core distribution. @@ -152,7 +146,7 @@ static int utf8_to_int(const char *str, unsigned int *outbuf){ // Get one element from x (VECSXP or STRSXP) convert to usigned int if necessary and store in c // TODO: this can probably be a bit optimized by decreasing the use of the *_ELT macros. -unsigned int *get_elem(SEXP x, int i, int bytes, int intdist, int *len, int *isna, unsigned int *c){ +unsigned int *get_elem(SEXP x, R_xlen_t i, int bytes, int intdist, int *len, int *isna, unsigned int *c){ if ( intdist ){ // we need a copy with trailing zero in this case since some distances diff --git a/pkg/src/utils.c b/pkg/src/utils.c index bedb765..dc2f8d2 100644 --- a/pkg/src/utils.c +++ b/pkg/src/utils.c @@ -26,9 +26,18 @@ unsigned int max_length(SEXP x){ unsigned int t=0, m; - for (int i=0; i