def str_get_all_between(aStr, a, b):
    '''Generator yielding every string between an occurrence of a in aStr and the following occurrence of b.

    Matches never overlap: after a match, the search for the next
    occurrence of a resumes right after the b that closed the previous
    match. For example, str_get_all_between('aabc', 'a', 'c') yields only
    'ab' (not 'ab' and 'b').

    Parameters:
        aStr: the string to search in.
        a: the opening delimiter.
        b: the closing delimiter.

    Yields:
        The text strictly between each a and the next b that follows it,
        in order of appearance. Nothing is yielded when either delimiter
        cannot be found anymore.
    '''

    searchStart = 0
    while True:
        aOffset = aStr.find(a, searchStart)
        if aOffset == -1:
            return
        offset = aOffset + len(a)
        bPos = aStr.find(b, offset)
        if bPos == -1:
            # No b after this a means there is none after any later a
            # either, so scanning further would be wasted work.
            return
        yield aStr[offset:bPos]
        # Resume after the closing delimiter so matches cannot overlap.
        # The max() only matters when both delimiters are empty: it forces
        # at least one character of progress so the generator terminates.
        searchStart = max(bPos + len(b), aOffset + 1)