[up]: add text preprocessing and detect_report method directly to text detector

SA-Yur-or · SA-Yur-or · commit 3318554ca0f4 · 2024-12-17T16:31:52.000+04:00
diff --git a/generated_text_detector/requirements.txt b/generated_text_detector/requirements.txt
@@ -1,4 +1,6 @@
+beautifulsoup4==4.12.3
 fastapi==0.110.0
+Markdown==3.7
 numpy==1.25.2
 nltk==3.8.1
 starlette==0.36.3
diff --git a/generated_text_detector/utils/preprocessing.py b/generated_text_detector/utils/preprocessing.py
@@ -0,0 +1,84 @@
+import re
+
+from bs4 import BeautifulSoup
+import markdown
+
+
+URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
+EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
+HOMOGLYPH_MAP = {
+    # Cyrillic look-alikes (keys are \u escapes: the glyphs cannot be told apart from Latin in source)
+    '\u0410': 'A',  # U+0410 Cyrillic Capital Letter A
+    '\u0412': 'B',  # U+0412 Cyrillic Capital Letter Ve
+    '\u0415': 'E',  # U+0415 Cyrillic Capital Letter Ie
+    '\u041A': 'K',  # U+041A Cyrillic Capital Letter Ka
+    '\u041C': 'M',  # U+041C Cyrillic Capital Letter Em
+    '\u041D': 'H',  # U+041D Cyrillic Capital Letter En
+    '\u041E': 'O',  # U+041E Cyrillic Capital Letter O
+    '\u0420': 'P',  # U+0420 Cyrillic Capital Letter Er
+    '\u0421': 'C',  # U+0421 Cyrillic Capital Letter Es
+    '\u0422': 'T',  # U+0422 Cyrillic Capital Letter Te
+    '\u0425': 'X',  # U+0425 Cyrillic Capital Letter Ha
+    '\u0430': 'a',  # U+0430 Cyrillic Small Letter A
+    '\u0435': 'e',  # U+0435 Cyrillic Small Letter Ie
+    '\u043E': 'o',  # U+043E Cyrillic Small Letter O
+    '\u0440': 'p',  # U+0440 Cyrillic Small Letter Er
+    '\u0441': 'c',  # U+0441 Cyrillic Small Letter Es
+    '\u0443': 'y',  # U+0443 Cyrillic Small Letter U
+    '\u0445': 'x',  # U+0445 Cyrillic Small Letter Ha
+    '\u0406': 'I',  # U+0406 Cyrillic Capital Letter Byelorussian-Ukrainian I
+    '\u0456': 'i',  # U+0456 Cyrillic Small Letter Byelorussian-Ukrainian I
+    # Greek look-alikes
+    '\u0391': 'A',  # U+0391 Greek Capital Letter Alpha
+    '\u0392': 'B',  # U+0392 Greek Capital Letter Beta
+    '\u0395': 'E',  # U+0395 Greek Capital Letter Epsilon
+    '\u0396': 'Z',  # U+0396 Greek Capital Letter Zeta
+    '\u0397': 'H',  # U+0397 Greek Capital Letter Eta
+    '\u0399': 'I',  # U+0399 Greek Capital Letter Iota
+    '\u039A': 'K',  # U+039A Greek Capital Letter Kappa
+    '\u039C': 'M',  # U+039C Greek Capital Letter Mu
+    '\u039D': 'N',  # U+039D Greek Capital Letter Nu
+    '\u039F': 'O',  # U+039F Greek Capital Letter Omicron
+    '\u03A1': 'P',  # U+03A1 Greek Capital Letter Rho
+    '\u03A4': 'T',  # U+03A4 Greek Capital Letter Tau
+    '\u03A5': 'Y',  # U+03A5 Greek Capital Letter Upsilon
+    '\u03A7': 'X',  # U+03A7 Greek Capital Letter Chi
+    '\u03F9': 'C',  # U+03F9 Greek Capital Lunate Sigma
+    '\u03B1': 'a',  # U+03B1 Greek Small Letter Alpha
+    '\u03BF': 'o',  # U+03BF Greek Small Letter Omicron
+    '\u03F2': 'c',  # U+03F2 Greek Lunate Sigma Symbol
+}
+
+
+def preprocessing_text(text: str) -> str:
+        """Normalize text: drop zero-width spaces, map homoglyphs to ASCII, strip markdown/HTML markup, URLs and e-mails, collapse whitespace."""
+        # Remove zero-width spaces (U+200B) outright; nothing is inserted in their place
+        text = text.replace('\u200B', '')
+
+        # Replace homoglyphs with ASCII equivalents
+        text = ''.join(HOMOGLYPH_MAP.get(char, char) for char in text)
+
+        # Render markdown to HTML so the markup can be stripped in the next step
+        html = markdown.markdown(text)
+
+        # Use BeautifulSoup to extract text from HTML
+        soup = BeautifulSoup(html, 'html.parser')
+        text = soup.get_text()
+    
+        # Remove URLs and e-mail addresses
+        text = URL_PATTERN.sub('', text)
+        text = EMAIL_PATTERN.sub('', text) 
+
+        text = " ".join(text.split())  # collapse every whitespace run into a single space
+
+        return text.strip()
+
+
+if __name__ == "__main__":  # manual smoke test: run the module directly to print the cleaned sample
+    sample = """
+Hello,   world!
+It's а test of preproccesing funсtion
+Google search url: https://www.google.com/
+"""
+
+    print(preprocessing_text(sample))
diff --git a/generated_text_detector/utils/text_detector.py b/generated_text_detector/utils/text_detector.py
@@ -3,6 +3,8 @@
 from nltk.tokenize import sent_tokenize
 from transformers import RobertaTokenizer
 
+from generated_text_detector.controllers.schemas_type import Author
+from generated_text_detector.utils.preprocessing import preprocessing_text
 from generated_text_detector.utils.model.roberta_classifier import RobertaClassifier
 
 
@@ -21,6 +23,7 @@ def __init__(
         model_name_or_path: str,
         device: str,
         max_len: int = 512,
+        preprocessing: bool = False
     ) -> None:
         
         self.device = torch.device(device)
@@ -30,6 +33,7 @@ def __init__(
         self.model.eval()
 
         self.__max_len = max_len
+        self.preprocessing = preprocessing
 
         # Optimizing GPU inference
         if self.device.type == 'cuda':
@@ -60,14 +64,14 @@ def __split_by_chunks(self, text: str) -> list[str]:
         for sentence in sent_tokenize(text):
             temp_count_tokens = len(self.tokenizer.encode(sentence))
             if cur_count_tokens + temp_count_tokens > self.__max_len:
-                chunks.append(cur_chunk)
+                chunks.append(cur_chunk.strip())
                 cur_chunk = sentence
                 cur_count_tokens = temp_count_tokens
             else:
                 cur_count_tokens += temp_count_tokens
                 cur_chunk += " " + sentence
         
-        chunks.append(cur_chunk)
+        chunks.append(cur_chunk.strip())
 
         return chunks
 
@@ -80,9 +84,6 @@ def __model_pass(self, texts: list[str]) -> list[float]:
         :return: List of scores
         :rtype: list[float]
         """
-        # Preprocessing
-        texts = [" ".join(text.split()) for text in texts]
-
         tokens = self.tokenizer.batch_encode_plus(
             texts,
             add_special_tokens=True,
@@ -111,6 +112,12 @@ def detect(self, text: str) -> list[tuple[str, float]]:
         :return: Text chunks with generated scores
         :rtype: list[tuple[str, float]]
         """
+        # Preprocessing
+        if self.preprocessing:
+            text = preprocessing_text(text)
+        else:
+            text = " ".join(text.split())
+
         text_chunks = self.__split_by_chunks(text)
 
         scores = self.__model_pass(text_chunks).tolist()
@@ -120,12 +127,68 @@ def detect(self, text: str) -> list[tuple[str, float]]:
         return res
     
 
+    def detect_report(self, text: str) -> dict:
+        """Detect whether text is generated and prepare a report.
+
+        :param text: Input text
+        :type text: str
+        :return: Report with keys "generated_score" (mean of the chunk scores) and "author" (value of the matching `Author` member)
+        :rtype: dict
+        """
+        # Preprocessing
+        if self.preprocessing:
+            text = preprocessing_text(text)
+        else:
+            text = " ".join(text.split())
+
+        text_chunks = self.__split_by_chunks(text)
+        scores = self.__model_pass(text_chunks)
+
+        # Average the per-chunk scores into a single score for the whole text
+        gen_score = sum(scores) / len(scores)
+        gen_score = gen_score.item()  # assumes the mean is a tensor/array scalar exposing .item() — TODO confirm
+        author = self.__determine_author(gen_score).value
+
+        res = {
+            "generated_score": gen_score,
+            "author": author
+        }
+       
+        return res
+
+
+    @staticmethod
+    def __determine_author(generated_score: float) -> Author:
+        """Convert a generated score into the final author prediction.
+        The generated score is compared with heuristics obtained from analysis on validation data
+        As a result, we get 5 categories described in the `Author` class
+        
+        :param generated_score: Generated score from detector model
+        :type generated_score: float, should be from 0 to 1
+        :return: Final predicted author
+        :rtype: Author
+        """
+        assert 0 <= generated_score <= 1  # NOTE(review): asserts are stripped under `python -O`; raise ValueError if this must always hold
+
+        if generated_score > 0.9:
+            return Author.LLM_GENERATED
+        elif generated_score > 0.7:
+            return Author.PROBABLY_LLM_GENERATED
+        elif generated_score > 0.3:
+            return Author.NOT_SURE
+        elif generated_score > 0.1:
+            return Author.PROBABLY_HUMAN_WRITTEN
+        else:
+            return Author.HUMAN
+
+
 if __name__ == "__main__":
     detector = GeneratedTextDetector(
         "SuperAnnotate/ai-detector",
-        "cuda:0"
+        "cuda:0",
+        preprocessing=True
     )
 
-    res = detector.detect("Hello, world!")
+    res = detector.detect_report("Hello, world!")
 
     print(res)