1+ """
2+ Autocomplete System using SQLite3 for Persistence with N-gram optimization
3+
4+ This module implements an autocomplete system that learns word sequences from training sentences
5+ and predicts the most likely next word based on the learned patterns. It uses SQLite3 for
6+ persistent storage of word mappings and predictions.
7+ """
8+
9+ import sqlite3
10+ import json
11+ from typing import Dict , List , Optional , Tuple , Union
12+
class AutoComplete:
    """
    N-gram autocomplete that persists its model in an SQLite database.

    The model consists of two dictionaries, each serialized as one JSON
    document and stored in a single row of its own table:

    * ``NGramMap`` (row ``ngramsmap``) maps an N-gram to a
      ``{next_word: count}`` dictionary of observed followers.
    * ``NGramPrediction`` (row ``ngrampredictions``) maps an N-gram to
      ``{'completion_word': str, 'completion_count': int}``, the most
      frequent follower seen so far.

    In memory an N-gram is a tuple of words; in JSON it is the same words
    joined by single spaces. Words are lowercased and split on arbitrary
    whitespace both when training and when predicting, so the two sides
    always agree on the key format.
    """

    # Table / row identifiers. These are fixed constants (never user input),
    # which is why they may be interpolated into SQL text below.
    _MAP_TABLE = "NGramMap"
    _MAP_ROW = "ngramsmap"
    _PREDICTION_TABLE = "NGramPrediction"
    _PREDICTION_ROW = "ngrampredictions"

    def __init__(self, n=2, db_path="autocompleteDB.sqlite3") -> None:
        """
        Open the database and make sure both model tables are initialised.

        Args:
            n: Number of words in each N-gram (must be a positive integer).
            db_path: Path of the SQLite database file. Defaults to the file
                name the class has always used; pass ``":memory:"`` for a
                throw-away, non-persistent model.

        Raises:
            ValueError: If ``n`` is not a positive integer.
        """
        if not isinstance(n, int) or n < 1:
            raise ValueError("n must be a positive integer")
        self.n = n

        # isolation_level=None puts the connection in auto-commit mode on
        # every Python version (the autocommit= keyword needs Python 3.12+).
        self.conn: sqlite3.Connection = sqlite3.connect(db_path, isolation_level=None)
        cursor: sqlite3.Cursor = self.conn.cursor()

        # Create each table and seed its single row independently, so a
        # database that contains only one of the two tables is repaired.
        for table, row_name in (
            (self._MAP_TABLE, self._MAP_ROW),
            (self._PREDICTION_TABLE, self._PREDICTION_ROW),
        ):
            cursor.execute(f"CREATE TABLE IF NOT EXISTS {table}(name TEXT, value TEXT)")
            cursor.execute(f"SELECT 1 FROM {table} WHERE name = ?", (row_name,))
            if cursor.fetchone() is None:
                cursor.execute(f"INSERT INTO {table} VALUES (?, ?)", (row_name, "{}"))

    def close(self) -> None:
        """Close the underlying database connection."""
        self.conn.close()

    def _load(self, table: str, row_name: str) -> Dict[Tuple[str, ...], dict]:
        """Read one JSON document and turn its top-level keys into N-gram tuples."""
        cursor: sqlite3.Cursor = self.conn.cursor()
        cursor.execute(f"SELECT value FROM {table} WHERE name = ?", (row_name,))
        payload: str = cursor.fetchone()[0]
        # Only the OUTER keys are N-grams. A json.loads object_hook would also
        # rewrite the nested dictionaries' keys and corrupt them, so the
        # conversion is done explicitly on the top level instead.
        return {tuple(key.split()): value for key, value in json.loads(payload).items()}

    def _save(self, table: str, row_name: str, data: Dict[Tuple[str, ...], dict]) -> None:
        """Serialize ``data`` (N-gram tuple keys) to JSON and store it in ``table``."""
        payload: str = json.dumps({" ".join(key): value for key, value in data.items()})
        self.conn.cursor().execute(
            f"UPDATE {table} SET value = ? WHERE name = ?", (payload, row_name)
        )

    def generate_ngrams(self, words_list: List[str]) -> List[Tuple[str, ...]]:
        """
        Generate all consecutive N-grams from a list of words.

        Args:
            words_list: The words to slide the window over.

        Returns:
            A list of ``self.n``-word tuples in order of appearance; empty
            when ``words_list`` has fewer than ``self.n`` words.
        """
        window_count = len(words_list) - self.n + 1
        return [tuple(words_list[i:i + self.n]) for i in range(window_count)]

    def train(self, sentence: str) -> str:
        """
        Train the autocomplete system with a single sentence.

        Updates the follower counts (NGramMap) and the most-frequent-follower
        table (NGramPrediction) for every N-gram of the sentence that is
        followed by another word, then writes both back to the database.

        Args:
            sentence: Training text. It is lowercased and split on any
                whitespace, matching the normalisation done by ``predict``.

        Returns:
            The confirmation message ``"training complete"``.
        """
        # split() without an argument collapses repeated whitespace, so no
        # empty "words" can end up inside an N-gram key.
        words_list: List[str] = sentence.lower().split()

        ngrams_map: Dict[Tuple[str, ...], Dict[str, int]] = self._load(
            self._MAP_TABLE, self._MAP_ROW
        )
        predictions: Dict[Tuple[str, ...], Dict[str, Union[str, int]]] = self._load(
            self._PREDICTION_TABLE, self._PREDICTION_ROW
        )

        # The i-th N-gram is followed by words_list[i + n]; zip stops before
        # the final N-gram, which has no following word.
        ngrams = self.generate_ngrams(words_list)
        for curr_ngram, next_word in zip(ngrams, words_list[self.n:]):
            followers = ngrams_map.setdefault(curr_ngram, {})
            count = followers.get(next_word, 0) + 1
            followers[next_word] = count

            # Keep the stored prediction unless this follower is now strictly
            # more frequent (ties keep the earlier winner).
            best = predictions.get(curr_ngram)
            if best is None or count > best['completion_count']:
                predictions[curr_ngram] = {
                    'completion_word': next_word,
                    'completion_count': count,
                }

        self._save(self._MAP_TABLE, self._MAP_ROW, ngrams_map)
        self._save(self._PREDICTION_TABLE, self._PREDICTION_ROW, predictions)

        return "training complete"

    def predict(self, words: str) -> Optional[str]:
        """
        Predict the most likely next word for a given input sequence of words.

        Only the last ``self.n`` words of the input are used as context,
        because an N-gram that does not end at the final word would predict
        the continuation of an earlier position in the text.

        Args:
            words: The text typed so far; lowercased and split on whitespace.

        Returns:
            The most frequent follower of the trailing N-gram, or ``None``
            when the input has fewer than ``self.n`` words or the N-gram was
            never seen during training.
        """
        input_words: List[str] = words.lower().split()
        if len(input_words) < self.n:
            return None

        predictions: Dict[Tuple[str, ...], Dict[str, Union[str, int]]] = self._load(
            self._PREDICTION_TABLE, self._PREDICTION_ROW
        )
        entry = predictions.get(tuple(input_words[-self.n:]))
        if entry is None:
            return None
        return str(entry['completion_word'])
150+
151+
def main() -> None:
    """Train on a sample sentence and print the predicted word after "to use"."""
    training_sentence: str = (
        "It is not enough to just know how tools work and what they worth, "
        "we have got to learn how to use them and to use them well. And with "
        "all these new weapons in your arsenal, we would better get those profits fired up"
    )

    # Initialize the autocomplete system (opens the SQLite database)
    autocomplete: AutoComplete = AutoComplete(n=2)
    try:
        autocomplete.train(training_sentence)

        # Test prediction
        test_words: str = "to use"
        prediction: Optional[str] = autocomplete.predict(test_words)
        print(f"Prediction for '{test_words}': {prediction}")
    finally:
        # Release the database connection even if training/prediction fails
        autocomplete.conn.close()


if __name__ == "__main__":
    main()
0 commit comments