-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexact_matches.r
executable file
·63 lines (62 loc) · 4.64 KB
/
exact_matches.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
exact.matches <- function (search.expression, corpus.vector, pcre=TRUE, case.sens=TRUE, characters.around=0, lines.around=0, clean.up.spaces=TRUE, output.conc=TRUE) {
# Thanks to Earl Brown for feedback on an earlier version
if (is.factor(corpus.vector)) { corpus.vector <- as.character(corpus.vector) }
if (characters.around!=0 & lines.around!=0) {
stop("At least one of 'characters.around' and 'lines.around' has to be zero ...")
}
line.numbers.with.matches <- grep(search.expression, corpus.vector, perl=pcre, value=FALSE, ignore.case=!case.sens) # the numbers of lines that contain matches
if (any(line.numbers.with.matches)) { # if there are matches ...
if (characters.around!=0) {
lines.with.matches <- gsub("( ?_qW1aS3zX5eR7dF9cV_|_qW1aS3zX5eR7dF9cV_ ?)",
" ",
paste(corpus.vector, collapse = "_qW1aS3zX5eR7dF9cV_"),
perl=TRUE)
} else {
lines.with.matches <- corpus.vector[line.numbers.with.matches] # the lines that contain matches
}
matches <- gregexpr(search.expression, lines.with.matches, perl = pcre, ignore.case = !case.sens) # the start positions and lengths of matches
number.of.matches <- sapply(matches, length) # the number of matches per line (of the lines that have matches)
lines <- rep(lines.with.matches, number.of.matches) # the lines with matches, each as many times as it has matches
line.numbers.with.matches <- rep(line.numbers.with.matches, number.of.matches)
starts <- unlist(matches) # starting positions of matches
stops <- starts + unlist(sapply(matches, attr, "match.length")) - 1 # end positions of matches
exact.string.matches <- substr(lines, starts, stops) # the exact matches
if (output.conc) {
lines.with.delimited.matches <- paste( # the lines with the tab-delimited matches
substr(lines, if (characters.around!=0) starts-characters.around else 1, starts-1), "\t", # preceding contexts
exact.string.matches, "\t", # matches
substr(lines, stops+1, if (characters.around!=0) stops+characters.around else nchar(lines)), # subsequent contexts
sep="")
if (lines.around!=0) {
corpus.vector <- append(corpus.vector, rep("", lines.around))
starts.of.previous.lines <- pmax(0, line.numbers.with.matches - lines.around)
ends.of.subsequent.lines <- pmin(line.numbers.with.matches + lines.around, length(corpus.vector))
for (current.line.with.delimited.match in seq(lines.with.delimited.matches)) {
lines.with.delimited.matches[current.line.with.delimited.match] <- paste(
paste(corpus.vector[starts.of.previous.lines[current.line.with.delimited.match]:(line.numbers.with.matches[current.line.with.delimited.match]-1)], collapse=" "),
lines.with.delimited.matches[current.line.with.delimited.match],
paste(corpus.vector[(line.numbers.with.matches[current.line.with.delimited.match]+1):ends.of.subsequent.lines[current.line.with.delimited.match]], collapse=" "),
sep=" ")
}
}
# cleaning output as necessary/requested by user
if (clean.up.spaces) { # clean up spaces around tabs
lines.with.delimited.matches <- gsub(" *\t *", "\t", lines.with.delimited.matches, perl=TRUE)
}
lines.with.delimited.matches <- gsub("(^ {1,}| {1,}$)", "", lines.with.delimited.matches, perl=TRUE) # clean up leading and trailing spaces
}
output.list <- list(exact.string.matches,
if (characters.around!=0) starts else line.numbers.with.matches, # starting character positions or the numbers of lines with matches, each as many times as it has matches
length(unique(line.numbers.with.matches))/sum(nzchar(corpus.vector)),
if (output.conc) lines.with.delimited.matches else NA,
c(Pattern = search.expression, "Corpus (1st 100 char.)"=substr(paste(corpus.vector, collapse=" "), 1, 100), PCRE=pcre, "Case-sensitive"=case.sens),
"1.3 (22 March 2014)")
names(output.list) <- c("Exact matches",
paste("Locations of matches (", ifelse (characters.around!=0, "characters", "lines"), ")", sep=""),
"Proportion of non-empty corpus parts with matches",
"Lines with delimited matches (if requested)",
"Search parameters",
"Version (date)")
return(output.list)
}
}