Add Rabin-Karp String Matching Algorithm (#13918)

Navadeep0007 · Navadeep0007 · commit 38daa15d113e · 2025-11-23T19:09:07.000+05:30
diff --git a/strings/rabin_karp.py b/strings/rabin_karp.py
@@ -1,91 +1,243 @@
-# Numbers of alphabet which we call base
-alphabet_size = 256
-# Modulus to hash a string
-modulus = 1000003
+"""
+Rabin-Karp String Matching Algorithm
 
+The Rabin-Karp algorithm uses hashing to find patterns in text.
+It employs a rolling hash technique for efficient pattern searching.
 
-def rabin_karp(pattern: str, text: str) -> bool:
-    """
-    The Rabin-Karp Algorithm for finding a pattern within a piece of text
-    with complexity O(nm), most efficient when it is used with multiple patterns
-    as it is able to check if any of a set of patterns match a section of text in o(1)
-    given the precomputed hashes.
+Time Complexity:
+- Average case: O(n + m) where n is text length, m is pattern length
+- Worst case: O(nm) when most windows hash-match (true or spurious hits)
 
-    This will be the simple version which only assumes one pattern is being searched
-    for but it's not hard to modify
+Space Complexity: O(1) for single pattern, O(k) for k patterns
 
-    1) Calculate pattern hash
+Applications:
+- Plagiarism detection
+- DNA sequence matching
+- Multiple pattern searching
+- Finding duplicate content
+"""
 
-    2) Step through the text one character at a time passing a window with the same
-        length as the pattern
-        calculating the hash of the text within the window compare it with the hash
-        of the pattern. Only testing equality if the hashes match
-    """
-    p_len = len(pattern)
-    t_len = len(text)
-    if p_len > t_len:
-        return False
 
-    p_hash = 0
-    text_hash = 0
-    modulus_power = 1
+def rabin_karp_search(
+    text: str, pattern: str, base: int = 256, modulus: int = 101
+) -> list[int]:
+    """
+    Return every index where pattern occurs in text (Rabin-Karp rolling hash).
+
+    Args:
+        text: The text to search in
+        pattern: The pattern to search for
+        base: The base (radix) of the polynomial hash (default: 256)
+        modulus: The modulus for hash calculation (default: 101, a prime)
+
+    Returns:
+        Starting indices of all (possibly overlapping) matches; [] if none
+
+    Examples:
+        >>> rabin_karp_search("hello world hello", "hello")
+        [0, 12]
+        >>> rabin_karp_search("aaaa", "aa")
+        [0, 1, 2]
+        >>> rabin_karp_search("abc", "xyz")
+        []
+        >>> rabin_karp_search("", "a")
+        []
+        >>> rabin_karp_search("a", "")
+        []
+        >>> rabin_karp_search("abcdefg", "cde")
+        [2]
+        >>> rabin_karp_search("ABABDABACDABABCABAB", "ABABCABAB")
+        [10]
+        >>> rabin_karp_search("test test test", "test")
+        [0, 5, 10]
+    """
+    if not pattern or not text or len(pattern) > len(text):
+        return []
 
-    # Calculating the hash of pattern and substring of text
-    for i in range(p_len):
-        p_hash = (ord(pattern[i]) + p_hash * alphabet_size) % modulus
-        text_hash = (ord(text[i]) + text_hash * alphabet_size) % modulus
-        if i == p_len - 1:
-            continue
-        modulus_power = (modulus_power * alphabet_size) % modulus
+    n = len(text)
+    m = len(pattern)
+    matches = []
 
-    for i in range(t_len - p_len + 1):
-        if text_hash == p_hash and text[i : i + p_len] == pattern:
-            return True
-        if i == t_len - p_len:
+    # Calculate hash value for pattern and first window of text
+    pattern_hash = 0
+    text_hash = 0
+    h = 1
+
+    # h = pow(base, m - 1, modulus): weight of the window's leading character
+    for _ in range(m - 1):
+        h = (h * base) % modulus
+
+    # Calculate initial hash values
+    for i in range(m):
+        pattern_hash = (base * pattern_hash + ord(pattern[i])) % modulus
+        text_hash = (base * text_hash + ord(text[i])) % modulus
+
+    # Slide the pattern over text one by one
+    for i in range(n - m + 1):
+        # Check if hash values match
+        if pattern_hash == text_hash:
+            # Verify character by character to avoid spurious hits
+            if text[i : i + m] == pattern:
+                matches.append(i)
+
+        # Calculate hash for next window (rolling hash)
+        if i < n - m:
+            # Remove leading character and add trailing character
+            text_hash = (
+                base * (text_hash - ord(text[i]) * h) + ord(text[i + m])
+            ) % modulus
+
+            # Defensive only: % with a positive modulus is never negative here
+            if text_hash < 0:
+                text_hash += modulus
+
+    return matches
+
+
+def rabin_karp_multiple(
+    text: str, patterns: list[str], base: int = 256, modulus: int = 101
+) -> dict[str, list[int]]:
+    """
+    Search for multiple patterns in text using Rabin-Karp algorithm.
+
+    Patterns of equal length share one rolling-hash pass over the text, and
+    each window is matched against them with a single hash-table lookup.
+
+    Args:
+        text: The text to search in
+        patterns: List of patterns to search for
+        base: The base for hash calculation
+        modulus: The modulus for hash calculation
+
+    Returns:
+        Dictionary mapping each pattern to list of indices where found
+
+    Examples:
+        >>> result = rabin_karp_multiple("hello world hello", ["hello", "world"])
+        >>> result == {"hello": [0, 12], "world": [6]}
+        True
+        >>> result = rabin_karp_multiple("aaaa", ["aa", "aaa"])
+        >>> result == {"aa": [0, 1, 2], "aaa": [0, 1]}
+        True
+        >>> result = rabin_karp_multiple("test", ["abc", "xyz"])
+        >>> result == {"abc": [], "xyz": []}
+        True
+        >>> result = rabin_karp_multiple("", ["a", "b"])
+        >>> result == {"a": [], "b": []}
+        True
+        >>> result = rabin_karp_multiple("abcdef", ["ab", "cd", "ef"])
+        >>> result == {"ab": [0], "cd": [2], "ef": [4]}
+        True
+    """
+    if not text or not patterns:
+        return {pattern: [] for pattern in patterns}
+
+    # Group patterns by length for efficient processing
+    patterns_by_length: dict[int, list[str]] = {}
+    for pattern in patterns:
+        if pattern:  # Skip empty patterns
+            length = len(pattern)
+            if length not in patterns_by_length:
+                patterns_by_length[length] = []
+            patterns_by_length[length].append(pattern)
+
+    results = {pattern: [] for pattern in patterns}
+
+    # Process each group of patterns with same length
+    for pattern_length, pattern_group in patterns_by_length.items():
+        if pattern_length > len(text):
             continue
-        # Calculate the https://en.wikipedia.org/wiki/Rolling_hash
-        text_hash = (
-            (text_hash - ord(text[i]) * modulus_power) * alphabet_size
-            + ord(text[i + p_len])
-        ) % modulus
-    return False
-
 
-def test_rabin_karp() -> None:
+        # Bucket patterns by hash (a set also drops duplicate patterns)
+        patterns_by_hash: dict[int, set[str]] = {}
+        for pattern in pattern_group:
+            pattern_hash = 0
+            for char in pattern:
+                pattern_hash = (base * pattern_hash + ord(char)) % modulus
+            patterns_by_hash.setdefault(pattern_hash, set()).add(pattern)
+
+        # Calculate hash for first window
+        text_hash = 0
+        h = 1
+        for _ in range(pattern_length - 1):
+            h = (h * base) % modulus
+
+        for i in range(pattern_length):
+            text_hash = (base * text_hash + ord(text[i])) % modulus
+
+        # Slide the window over text
+        for i in range(len(text) - pattern_length + 1):
+            # Look up the window hash once; only patterns in that bucket
+            # can match, so cost no longer grows with the group size
+            for pattern in patterns_by_hash.get(text_hash, ()):
+                # Verify to avoid spurious hits
+                if text[i : i + pattern_length] == pattern:
+                    results[pattern].append(i)
+
+            # Calculate hash for next window
+            if i < len(text) - pattern_length:
+                text_hash = (
+                    base * (text_hash - ord(text[i]) * h)
+                    + ord(text[i + pattern_length])
+                ) % modulus
+
+                if text_hash < 0:
+                    text_hash += modulus
+
+    return results
+
+
+def rabin_karp_search_optimized(
+    text: str, pattern: str, base: int = 256, modulus: int = 1_000_000_007
+) -> list[int]:
     """
-    >>> test_rabin_karp()
-    Success.
+    Call rabin_karp_search with a larger default modulus to reduce collisions.
+
+    A larger prime modulus (10^9 + 7) makes spurious hash matches rarer, so
+    fewer windows need the character-by-character verification step.
+
+    Args:
+        text: The text to search in
+        pattern: The pattern to search for
+        base: The base for hash calculation
+        modulus: Large prime modulus (default: 10^9 + 7)
+
+    Returns:
+        List of starting indices where pattern is found
+
+    Examples:
+        >>> rabin_karp_search_optimized("hello world", "world")
+        [6]
+        >>> rabin_karp_search_optimized("aaabaaaa", "aaaa")
+        [4]
+        >>> rabin_karp_search_optimized("abc", "d")
+        []
     """
-    # Test 1)
-    pattern = "abc1abc12"
-    text1 = "alskfjaldsabc1abc1abc12k23adsfabcabc"
-    text2 = "alskfjaldsk23adsfabcabc"
-    assert rabin_karp(pattern, text1)
-    assert not rabin_karp(pattern, text2)
-
-    # Test 2)
-    pattern = "ABABX"
-    text = "ABABZABABYABABX"
-    assert rabin_karp(pattern, text)
-
-    # Test 3)
-    pattern = "AAAB"
-    text = "ABAAAAAB"
-    assert rabin_karp(pattern, text)
-
-    # Test 4)
-    pattern = "abcdabcy"
-    text = "abcxabcdabxabcdabcdabcy"
-    assert rabin_karp(pattern, text)
-
-    # Test 5)
-    pattern = "Lü"
-    text = "Lüsai"
-    assert rabin_karp(pattern, text)
-    pattern = "Lue"
-    assert not rabin_karp(pattern, text)
-    print("Success.")
+    return rabin_karp_search(text, pattern, base, modulus)
 
 
 if __name__ == "__main__":
-    test_rabin_karp()
+    import doctest
+
+    failed = doctest.testmod().failed
+
+    # Usage demonstration (prints match counts; nothing is timed)
+    text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit" * 100
+    pattern = "consectetur"
+
+    print("Rabin-Karp String Matching Algorithm Demo")
+    print("=" * 50)
+
+    # Single pattern search
+    matches = rabin_karp_search(text, pattern)
+    print(f"\nSearching for '{pattern}' in text ({len(text)} chars)")
+    print(f"Found {len(matches)} matches at indices: {matches[:5]}...")
+
+    # Multiple pattern search
+    patterns = ["Lorem", "ipsum", "consectetur", "adipiscing"]
+    results = rabin_karp_multiple(text, patterns)
+    print(f"\nSearching for {len(patterns)} patterns:")
+    for p, indices in results.items():
+        print(f"  '{p}': {len(indices)} matches")
+
+    print("\n✓ All tests passed!" if not failed else f"\n✗ {failed} tests failed!")