Skip to content

Commit 3318554

Browse files
committed
[up]: add text preprocessing and detect_report method directly to text detector
1 parent 8e9f2d8 commit 3318554

File tree

3 files changed

+156
-7
lines changed

3 files changed

+156
-7
lines changed

generated_text_detector/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
beautifulsoup4==4.12.3
12
fastapi==0.110.0
3+
Markdown==3.7
24
numpy==1.25.2
35
nltk==3.8.1
46
starlette==0.36.3
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import re
2+
3+
from bs4 import BeautifulSoup
4+
import markdown
5+
6+
7+
# Matches http:// and https:// URLs.
# NOTE(review): the `$-_` range inside the character class is what allows
# `/`, `:`, `?`, `=` and similar characters to match, so the pattern is left
# exactly as it was.
URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

# Matches e-mail addresses. The top-level-domain class is letters only; a `|`
# inside a character class is a literal pipe, not an alternation.
EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

# Look-alike (homoglyph) characters mapped to the ASCII letter they imitate.
# Keys are written as \u escapes because the glyphs are visually
# indistinguishable from ASCII (and from each other) in most fonts, which
# makes literal keys impossible to review and easy to duplicate by accident.
HOMOGLYPH_MAP = {
    # Cyrillic letters
    '\u0410': 'A',  # U+0410 Cyrillic Capital Letter A
    '\u0412': 'B',  # U+0412 Cyrillic Capital Letter Ve
    '\u0415': 'E',  # U+0415 Cyrillic Capital Letter Ie
    '\u041A': 'K',  # U+041A Cyrillic Capital Letter Ka
    '\u041C': 'M',  # U+041C Cyrillic Capital Letter Em
    '\u041D': 'H',  # U+041D Cyrillic Capital Letter En
    '\u041E': 'O',  # U+041E Cyrillic Capital Letter O
    '\u0420': 'P',  # U+0420 Cyrillic Capital Letter Er
    '\u0421': 'C',  # U+0421 Cyrillic Capital Letter Es
    '\u0422': 'T',  # U+0422 Cyrillic Capital Letter Te
    '\u0425': 'X',  # U+0425 Cyrillic Capital Letter Ha
    '\u0430': 'a',  # U+0430 Cyrillic Small Letter A
    '\u0435': 'e',  # U+0435 Cyrillic Small Letter Ie
    '\u043E': 'o',  # U+043E Cyrillic Small Letter O
    '\u0440': 'p',  # U+0440 Cyrillic Small Letter Er
    '\u0441': 'c',  # U+0441 Cyrillic Small Letter Es
    '\u0443': 'y',  # U+0443 Cyrillic Small Letter U
    '\u0445': 'x',  # U+0445 Cyrillic Small Letter Ha
    '\u0406': 'I',  # U+0406 Cyrillic Capital Letter Byelorussian-Ukrainian I
    '\u0456': 'i',  # U+0456 Cyrillic Small Letter Byelorussian-Ukrainian I
    # Greek letters
    '\u0391': 'A',  # U+0391 Greek Capital Letter Alpha
    '\u0392': 'B',  # U+0392 Greek Capital Letter Beta
    '\u0395': 'E',  # U+0395 Greek Capital Letter Epsilon
    '\u0396': 'Z',  # U+0396 Greek Capital Letter Zeta
    '\u0397': 'H',  # U+0397 Greek Capital Letter Eta
    '\u0399': 'I',  # U+0399 Greek Capital Letter Iota
    '\u039A': 'K',  # U+039A Greek Capital Letter Kappa
    '\u039C': 'M',  # U+039C Greek Capital Letter Mu
    '\u039D': 'N',  # U+039D Greek Capital Letter Nu
    '\u039F': 'O',  # U+039F Greek Capital Letter Omicron
    '\u03A1': 'P',  # U+03A1 Greek Capital Letter Rho
    '\u03A4': 'T',  # U+03A4 Greek Capital Letter Tau
    '\u03A5': 'Y',  # U+03A5 Greek Capital Letter Upsilon
    '\u03A7': 'X',  # U+03A7 Greek Capital Letter Chi
    '\u03F9': 'C',  # U+03F9 Greek Capital Lunate Sigma Symbol
    '\u03B1': 'a',  # U+03B1 Greek Small Letter Alpha
    '\u03BF': 'o',  # U+03BF Greek Small Letter Omicron
    '\u03F2': 'c',  # U+03F2 Greek Lunate Sigma Symbol
}
51+
52+
53+
def preprocessing_text(text: str) -> str:
    """Normalize raw input text before it is handed to the detector.

    Steps, in order: delete zero-width spaces (U+200B), replace every
    character found in ``HOMOGLYPH_MAP`` with its ASCII counterpart, render
    the text as Markdown and keep only the text content of the resulting
    HTML, remove URLs and e-mail addresses, and collapse each run of
    whitespace into a single space.

    :param text: Raw input text
    :type text: str
    :return: Cleaned text without leading or trailing whitespace
    :rtype: str
    """
    # Zero-width spaces are deleted outright, not turned into visible spaces
    without_zwsp = text.replace('\u200B', '')

    # Swap homoglyphs for their ASCII equivalents in a single pass
    ascii_like = without_zwsp.translate(str.maketrans(HOMOGLYPH_MAP))

    # Markdown -> HTML -> plain text, which drops the markup itself
    rendered_html = markdown.markdown(ascii_like)
    plain = BeautifulSoup(rendered_html, 'html.parser').get_text()

    # URLs are removed first, then e-mail addresses
    plain = EMAIL_PATTERN.sub('', URL_PATTERN.sub('', plain))

    # Collapse whitespace runs (including newlines) into single spaces
    collapsed = " ".join(plain.split())
    return collapsed.strip()
75+
76+
77+
if __name__ == "__main__":
    # Manual smoke test: run the preprocessing on a short multi-line sample
    # that includes a URL and print the cleaned result.
    # NOTE(review): the second sentence appears to contain look-alike
    # (non-ASCII) letters on purpose - keep the literal untouched.
    demo_text = """
Hello, world!
It's а test of preproccesing funсtion
Google search url: https://www.google.com/
"""

    cleaned_demo = preprocessing_text(demo_text)
    print(cleaned_demo)

generated_text_detector/utils/text_detector.py

Lines changed: 70 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from nltk.tokenize import sent_tokenize
44
from transformers import RobertaTokenizer
55

6+
from generated_text_detector.controllers.schemas_type import Author
7+
from generated_text_detector.utils.preprocessing import preprocessing_text
68
from generated_text_detector.utils.model.roberta_classifier import RobertaClassifier
79

810

@@ -21,6 +23,7 @@ def __init__(
2123
model_name_or_path: str,
2224
device: str,
2325
max_len: int = 512,
26+
preprocessing: bool = False
2427
) -> None:
2528

2629
self.device = torch.device(device)
@@ -30,6 +33,7 @@ def __init__(
3033
self.model.eval()
3134

3235
self.__max_len = max_len
36+
self.preprocessing = preprocessing
3337

3438
# Optimizing GPU inference
3539
if self.device.type == 'cuda':
@@ -60,14 +64,14 @@ def __split_by_chunks(self, text: str) -> list[str]:
6064
for sentence in sent_tokenize(text):
6165
temp_count_tokens = len(self.tokenizer.encode(sentence))
6266
if cur_count_tokens + temp_count_tokens > self.__max_len:
63-
chunks.append(cur_chunk)
67+
chunks.append(cur_chunk.strip())
6468
cur_chunk = sentence
6569
cur_count_tokens = temp_count_tokens
6670
else:
6771
cur_count_tokens += temp_count_tokens
6872
cur_chunk += " " + sentence
6973

70-
chunks.append(cur_chunk)
74+
chunks.append(cur_chunk.strip())
7175

7276
return chunks
7377

@@ -80,9 +84,6 @@ def __model_pass(self, texts: list[str]) -> list[float]:
8084
:return: List of scores
8185
:rtype: list[float]
8286
"""
83-
# Preprocessing
84-
texts = [" ".join(text.split()) for text in texts]
85-
8687
tokens = self.tokenizer.batch_encode_plus(
8788
texts,
8889
add_special_tokens=True,
@@ -111,6 +112,12 @@ def detect(self, text: str) -> list[tuple[str, float]]:
111112
:return: Text chunks with generated scores
112113
:rtype: list[tuple[str, float]]
113114
"""
115+
# Preprocessing
116+
if self.preprocessing:
117+
text = preprocessing_text(text)
118+
else:
119+
text = " ".join(text.split())
120+
114121
text_chunks = self.__split_by_chunks(text)
115122

116123
scores = self.__model_pass(text_chunks).tolist()
@@ -120,12 +127,68 @@ def detect(self, text: str) -> list[tuple[str, float]]:
120127
return res
121128

122129

130+
def detect_report(self, text: str) -> dict:
    """Score a text and summarise the outcome as a small report.

    The text is cleaned first (with ``preprocessing_text`` when
    ``self.preprocessing`` is set, otherwise only whitespace is collapsed),
    split into chunks, and every chunk is passed through the model. The
    chunk scores are then averaged with equal weight per chunk.

    :param text: Input text
    :type text: str
    :return: Report with the keys ``generated_score`` (mean chunk score)
        and ``author`` (value of the ``Author`` member chosen for that score)
    :rtype: dict
    """
    # Preprocessing
    cleaned = preprocessing_text(text) if self.preprocessing else " ".join(text.split())

    chunk_scores = self.__model_pass(self.__split_by_chunks(cleaned))

    # Unweighted mean over all chunks; .item() unwraps it to a plain number
    # (assumes the scores are tensor/array-like - TODO confirm against __model_pass)
    mean_score = (sum(chunk_scores) / len(chunk_scores)).item()

    return {
        "generated_score": mean_score,
        "author": self.__determine_author(mean_score).value,
    }
158+
159+
160+
@staticmethod
def __determine_author(generated_score: float) -> Author:
    """Convert a generated score into the final author prediction.

    The score is compared against fixed thresholds (per the original note,
    heuristics obtained from analysis on validation data), which yields one
    of the 5 categories of the `Author` class.

    :param generated_score: Generated score from the detector model
    :type generated_score: float, should be from 0 to 1
    :return: Final author prediction
    :rtype: Author
    """
    assert 0 <= generated_score <= 1

    # Ordered from the highest lower bound down; the first bound that the
    # score strictly exceeds decides the category.
    bands = (
        (0.9, Author.LLM_GENERATED),
        (0.7, Author.PROBABLY_LLM_GENERATED),
        (0.3, Author.NOT_SURE),
        (0.1, Author.PROBABLY_HUMAN_WRITTEN),
    )
    for lower_bound, label in bands:
        if generated_score > lower_bound:
            return label

    # Scores of 0.1 or below
    return Author.HUMAN
183+
184+
123185
if __name__ == "__main__":
124186
detector = GeneratedTextDetector(
125187
"SuperAnnotate/ai-detector",
126-
"cuda:0"
188+
"cuda:0",
189+
preprocessing=True
127190
)
128191

129-
res = detector.detect("Hello, world!")
192+
res = detector.detect_report("Hello, world!")
130193

131194
print(res)

0 commit comments

Comments
 (0)