def str_get_all_between(aStr, a, b):
    '''
    Generator yielding every string between an occurrence of a in aStr and the following occurrence of b.

    Matches are taken from left to right and never overlap: after a match, the
    search for the next a resumes right behind the b which closed that match.
    For example, str_get_all_between('aabc', 'a', 'c') yields only 'ab', not
    'ab' followed by the half-overlapping 'b'.

    aStr is the string to search, a the opening delimiter, b the closing one.
    Nothing is yielded if no a is followed by a b.
    '''

    searchFrom = 0
    while True:
        aPos = aStr.find(a, searchFrom)
        if aPos == -1:
            return
        offset = aPos + len(a)
        bPos = aStr.find(b, offset)
        if bPos == -1:
            # No b behind this a means there is none behind any later a either,
            # so stop instead of rescanning the tail for each remaining a.
            return
        yield aStr[offset:bPos]
        matchEnd = bPos + len(b)
        # A match can only be zero-width when both a and b are empty; step one
        # position forward in that case so the generator always terminates.
        searchFrom = matchEnd if matchEnd > aPos else aPos + 1