search: make the \b and \B anchors work correctly in both directions

That is: remove the special treatment of BOW anchors, and make regexes match against the whole line rather than against an artificially shortened one. The latter method creates ghost matches: matches at the starting point of the search that aren't really matches when seen in the context of the whole line. This fixes https://savannah.gnu.org/bugs/?50030.
2017-01-26 16:24:18 +01:00 · 2017-01-26 16:24:18 +01:00 · 64aa8757a8
--- a/src/search.c
+++ b/src/search.c
@ -38,8 +38,6 @@ static bool history_changed = FALSE;
 #ifdef HAVE_REGEX_H
 static bool regexp_compiled = FALSE;
 	/* Have we compiled any regular expressions? */
-static bool bow_anchored = FALSE;
-	/* Whether a regex starts with a beginning-of-word anchor. */

 /* Compile the given regular expression and store it in search_regexp.
 * Return TRUE if the expression is valid, and FALSE otherwise. */
@ -62,10 +60,6 @@ bool regexp_init(const char *regexp)

    regexp_compiled = TRUE;

-    /* Remember whether the regex starts with a beginning-of-word anchor. */
-    bow_anchored = (strncmp(regexp, "\\<", 2) == 0 ||
-			strncmp(regexp, "\\b", 2) == 0);
-
    return TRUE;
 }

@ -302,24 +296,8 @@ int findnextstr(const char *needle, bool whole_word_only, size_t *match_len,
 	if (found != NULL) {
 #ifdef HAVE_REGEX_H
 	    /* When doing a regex search, compute the length of the match. */
-	    if (ISSET(USE_REGEXP)) {
+	    if (ISSET(USE_REGEXP))
 		found_len = regmatches[0].rm_eo - regmatches[0].rm_so;
-
-		/* If the regex starts with a BOW anchor, check that the found
-		 * match actually is the start of a word.  If not, continue. */
-		if (bow_anchored && found != line->data) {
-		    size_t before = move_mbleft(line->data, found - line->data);
-
-		    /* If a word char is before the match, skip this match. */
-		    if (is_word_mbchar(line->data + before, FALSE)) {
-			if (ISSET(BACKWARDS_SEARCH))
-			    from = line->data + before;
-			else
-			    from = found + move_mbright(found, 0);
-			continue;
-		    }
-		}
-	    }
 #endif
 #ifndef DISABLE_SPELLER
 	    /* When we're spell checking, a match should be a separate word;
@ -531,7 +509,7 @@ int replace_regexp(char *string, bool create)
 	     * subexpression match to the new line. */
 	    if (create) {
 		strncpy(string, openfile->current->data +
-			openfile->current_x + regmatches[num].rm_so, i);
+					regmatches[num].rm_so, i);
 		string += i;
 	    }
 	}
--- a/src/utils.c
+++ b/src/utils.c
@ -315,41 +315,66 @@ bool is_separate_word(size_t position, size_t length, const char *buf)
 }
 #endif /* !DISABLE_SPELLER */

-/* If we are searching backwards, we will find the last match that
- * starts no later than start.  Otherwise we find the first match
- * starting no earlier than start.  If we are doing a regexp search, we
- * fill in the global variable regmatches with at most 9 subexpression
- * matches.  Also, all .rm_so elements are relative to the start of the
- * whole match, so regmatches[0].rm_so == 0. */
+/* Return the position of the needle in the haystack, or NULL if not found.
+ * When searching backwards, we will find the last match that starts no later
+ * than the given start; otherwise, we find the first match starting no earlier
+ * than start.  If we are doing a regexp search, and we find a match, we fill
+ * in the global variable regmatches with at most 9 subexpression matches. */
 const char *strstrwrapper(const char *haystack, const char *needle,
 	const char *start)
 {
 #ifdef HAVE_REGEX_H
    if (ISSET(USE_REGEXP)) {
 	if (ISSET(BACKWARDS_SEARCH)) {
-	    if (regexec(&search_regexp, haystack, 1, regmatches, 0) == 0 &&
-			haystack + regmatches[0].rm_so <= start) {
-		const char *retval = haystack + regmatches[0].rm_so;
+	    size_t last_find, ceiling, far_end;
+	    size_t floor = 0, next_rung = 0;
+		/* The start of the search range, and the next start. */

-		/* Search forward until there are no more matches. */
-		while (regexec(&search_regexp, retval + 1, 1,
-			regmatches, REG_NOTBOL) == 0 &&
-			retval + regmatches[0].rm_so + 1 <= start)
-		    retval += regmatches[0].rm_so + 1;
-		/* Finally, put the subexpression matches in global
-		 * variable regmatches.  The REG_NOTBOL flag doesn't
-		 * matter now. */
-		regexec(&search_regexp, retval, 10, regmatches, 0);
-		return retval;
+	    if (regexec(&search_regexp, haystack, 1, regmatches, 0) != 0)
+		return NULL;
+
+	    far_end = strlen(haystack);
+	    ceiling = start - haystack;
+	    last_find = regmatches[0].rm_so;
+
+	    /* A result beyond the search range also means: no match. */
+	    if (last_find > ceiling)
+		return NULL;
+
+	    /* Move the start-of-range forward until there is no more match;
+	     * then the last match found is the first match backwards. */
+	    while (regmatches[0].rm_so <= ceiling) {
+		floor = next_rung;
+		last_find = regmatches[0].rm_so;
+		/* If this is the last possible match, don't try to advance. */
+		if (last_find == ceiling)
+		    break;
+		next_rung = move_mbright(haystack, last_find);
+		regmatches[0].rm_so = next_rung;
+		regmatches[0].rm_eo = far_end;
+		if (regexec(&search_regexp, haystack, 1, regmatches,
+					REG_STARTEND) != 0)
+		    break;
 	    }
-	} else if (regexec(&search_regexp, start, 10, regmatches,
-			(start > haystack) ? REG_NOTBOL : 0) == 0) {
-	    const char *retval = start + regmatches[0].rm_so;

-	    regexec(&search_regexp, retval, 10, regmatches, 0);
-	    return retval;
+	    /* Find the last match again, to get possible submatches. */
+	    regmatches[0].rm_so = floor;
+	    regmatches[0].rm_eo = far_end;
+	    if (regexec(&search_regexp, haystack, 10, regmatches,
+					REG_STARTEND) != 0)
+		statusline(ALERT, "BAD: failed to refind the match!");
+
+	    return haystack + regmatches[0].rm_so;
 	}
-	return NULL;
+
+	/* Do a forward regex search from the starting point. */
+	regmatches[0].rm_so = start - haystack;
+	regmatches[0].rm_eo = strlen(haystack);
+	if (regexec(&search_regexp, haystack, 10, regmatches,
+					REG_STARTEND) != 0)
+	    return NULL;
+	else
+	    return haystack + regmatches[0].rm_so;
    }
 #endif /* HAVE_REGEX_H */
    if (ISSET(CASE_SENSITIVE)) {