Commit c5fe87b

code cleanup
1 parent ac6a795 commit c5fe87b

22 files changed: +206 -203 lines

README.md

Lines changed: 3 additions & 0 deletions
@@ -675,6 +675,9 @@ Training process Example | Interpolation between images Example
 >Example №5
 *a beautiful portrait painting of a cyberpunk city by simon stalenhag and pascal blanche and alphonse mucha, in style of colorful comic. symmetry, hyper detailed. octanev render. trending on artstation*

+Code:
+*[Model example](examples/gpt.py)*
+
 </details>

 <details>

data_loader.py

Lines changed: 0 additions & 1 deletion
@@ -1,5 +1,4 @@
 import os
-import tarfile
 import zipfile
 from pathlib import Path

examples/gpt.py

Lines changed: 19 additions & 18 deletions
@@ -1,18 +1,19 @@
 import math
 from pathlib import Path
+from typing import Optional

+import matplotlib.pyplot as plt
 import numpy as np
 from tokenizers.implementations import ByteLevelBPETokenizer
 from tokenizers.processors import TemplateProcessing
 from tqdm import tqdm

 import neunet
 import neunet.nn as nn
-from datasets import load_dataset
+from datasets import load_dataset # type: ignore
 from neunet import Tensor
 from neunet.optim import Adam

-import matplotlib.pyplot as plt

 class MultiHeadAttention(nn.Module):
     def __init__(self, d_model, n_heads, dropout=0.1):
@@ -22,7 +23,8 @@ def __init__(self, d_model, n_heads, dropout=0.1):
         self.scale = math.sqrt(d_model)
         self.dropout = nn.Dropout(dropout)

-        assert d_model % n_heads == 0
+        if d_model % n_heads != 0:
+            raise ValueError("d_model must be divisible by n_heads")

         self.depth = d_model // n_heads

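Replacing the bare assert with an explicit ValueError keeps the check active even when Python runs with -O, which strips assert statements. A minimal standalone sketch of the pattern, with hypothetical values for d_model and n_heads:

    d_model, n_heads = 512, 8  # hypothetical hyperparameters, not taken from the diff

    if d_model % n_heads != 0:
        raise ValueError("d_model must be divisible by n_heads")

    depth = d_model // n_heads  # per-head feature size (64 here)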
@@ -33,7 +35,7 @@ def __init__(self, d_model, n_heads, dropout=0.1):
         self.fc = nn.Linear(d_model, d_model)


-    def forward(self, q: Tensor, k: Tensor, v: Tensor, mask: Tensor=None):
+    def forward(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor]=None):
         batch_size = q.shape[0]
         q = self.wq(q).contiguous().reshape(batch_size, -1, self.n_heads, self.depth).transpose(0, 2, 1, 3)
         k = self.wk(k).contiguous().reshape(batch_size, -1, self.n_heads, self.depth).transpose(0, 2, 1, 3)
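The Optional[Tensor] annotation makes the None default explicit; PEP 484 asks for optionality to be spelled out rather than implied by a None default. A minimal sketch of the signature change only (the Tensor class below is a stand-in so the sketch runs on its own):

    from typing import Optional

    class Tensor:  # stand-in for neunet.Tensor, only to keep this sketch self-contained
        pass

    def forward(q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None) -> None:
        # mask may be a Tensor or None; callers that pass no mask get None
        pass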
@@ -191,21 +193,20 @@ def forward(self, x) -> tuple[Tensor, Tensor]:

 for split, split_dataset in data.items():

-    with open(f"./datasets/sd-prompts/sd-prompts-{split}.txt", 'w', encoding='utf-8') as f:
+    with Path(f"./datasets/sd-prompts/sd-prompts-{split}.txt").open('w', encoding='utf-8') as f:
         for item in split_dataset:
             f.write(item['Prompt'] + '\n')


 FILE_PATHS = [DATASET_PATH / "sd-prompts-train.txt", DATASET_PATH / "sd-prompts-test.txt"]
-FILE_PATHS = [str(path) for path in FILE_PATHS]


 # [Train and load Tokenizer]

 if not (SAVE_PATH / "vocab").exists():
     tokenizer = ByteLevelBPETokenizer()

-    tokenizer.train(files=FILE_PATHS, vocab_size=15000, min_frequency=1, special_tokens=[
+    tokenizer.train(files=[str(path) for path in FILE_PATHS], vocab_size=15000, min_frequency=1, special_tokens=[
         PAD_TOKEN,
         SOS_TOKEN,
         EOS_TOKEN,
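Two cleanups meet in this hunk: files are written through pathlib's Path.open instead of the builtin open, and the Path objects in FILE_PATHS are converted to strings only at the call site that needs them (tokenizer.train). A minimal sketch of the writing pattern, using one of the paths from the diff as an example:

    from pathlib import Path

    out_path = Path("./datasets/sd-prompts/sd-prompts-train.txt")
    out_path.parent.mkdir(parents=True, exist_ok=True)  # not in the diff; added so the sketch runs anywhere
    with out_path.open("w", encoding="utf-8") as f:
        f.write("a sample prompt line\n")

    print(str(out_path))  # str(path) is what APIs expecting plain strings receive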
@@ -231,7 +232,7 @@ class DataPreprocessor():
     def __init__(self, tokenizer: ByteLevelBPETokenizer):
         self.tokenizer = tokenizer

-        self.tokenizer._tokenizer.post_processor = TemplateProcessing(
+        self.tokenizer._tokenizer.post_processor = TemplateProcessing( # noqa SLF001
             single=f"{SOS_TOKEN} $A {EOS_TOKEN}",
             special_tokens=[
                 (f"{SOS_TOKEN}", tokenizer.token_to_id(f"{SOS_TOKEN}")),
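For context, the post-processor configured here wraps every encoded sequence in the start and end tokens. A standalone sketch of the same idea with the tokenizers library, assuming a freshly trained tokenizer whose vocabulary contains <sos> and <eos> (the one-line corpus is invented for the sketch):

    from tokenizers.implementations import ByteLevelBPETokenizer
    from tokenizers.processors import TemplateProcessing

    tok = ByteLevelBPETokenizer()
    tok.train_from_iterator(["a tiny made-up corpus"], vocab_size=300,
                            special_tokens=["<pad>", "<sos>", "<eos>"])

    tok._tokenizer.post_processor = TemplateProcessing(  # same private attribute the diff marks with noqa
        single="<sos> $A <eos>",
        special_tokens=[("<sos>", tok.token_to_id("<sos>")),
                        ("<eos>", tok.token_to_id("<eos>"))],
    )

    print(tok.encode("hello").tokens)  # first and last tokens are now <sos> and <eos>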
@@ -242,13 +243,13 @@ def __init__(self, tokenizer: ByteLevelBPETokenizer):
         # self.tokenizer.enable_truncation(max_length=151)
         self.tokenizer.enable_padding(pad_token = PAD_TOKEN)

-    def tokenize(self, paths: list[str], batch_size: int, lines_limit: int = None) -> np.ndarray:
+    def tokenize(self, paths: list[str], batch_size: int, lines_limit: Optional[int] = None) -> list[np.ndarray]:
         examples = []

         for src_file in paths:
             print(f"Processing {src_file}")
-            src_file = Path(src_file)
-            lines = src_file.read_text(encoding="utf-8").splitlines()
+            path_src_file = Path(src_file)
+            lines = path_src_file.read_text(encoding="utf-8").splitlines()

             if lines_limit:
                 lines = lines[:lines_limit]
@@ -259,13 +260,13 @@ def tokenize(self, paths: list[str], batch_size: int, lines_limit: int = None) -

         return examples

-    def __call__(self, paths: list[str], batch_size: int, lines_limit: int = None) -> np.ndarray:
+    def __call__(self, paths: list[str], batch_size: int, lines_limit: Optional[int] = None) -> list[np.ndarray]:
         return self.tokenize(paths, batch_size, lines_limit)

 data_post_processor = DataPreprocessor(tokenizer)

-train_data = data_post_processor([FILE_PATHS[0]], batch_size = BATCH_SIZE, lines_limit=20000)
-val_data = data_post_processor([FILE_PATHS[1]], batch_size = BATCH_SIZE, lines_limit=2000)
+train_data = data_post_processor([str(FILE_PATHS[0])], batch_size = BATCH_SIZE, lines_limit=20000)
+val_data = data_post_processor([str(FILE_PATHS[1])], batch_size = BATCH_SIZE, lines_limit=2000)



@@ -300,7 +301,7 @@ def __call__(self, paths: list[str], batch_size: int, lines_limit: int = None) -

 # [train, eval, predict methods definition]

-def train_step(dataset: np.ndarray, epoch: int, epochs: int) -> float:
+def train_step(dataset: list[np.ndarray], epoch: int, epochs: int) -> float:
     loss_history = []
     model.train()

@@ -333,7 +334,7 @@ def train_step(dataset: np.ndarray, epoch: int, epochs: int) -> float:

     return epoch_loss

-def eval(dataset: np.ndarray) -> float:
+def eval(dataset: list[np.ndarray]) -> float:
     loss_history = []
     model.eval()

@@ -361,7 +362,7 @@ def eval(dataset: np.ndarray) -> float:
     return epoch_loss


-def train(train_data: np.ndarray, val_data: np.ndarray, epochs: int, save_every_epochs: int, save_path: str = None, validation_check: bool = False):
+def train(train_data: list[np.ndarray], val_data: list[np.ndarray], epochs: int, save_every_epochs: int, save_path: Optional[str] = None, validation_check: bool = False):
     best_val_loss = float('inf')

     train_loss_history = []
@@ -395,7 +396,7 @@ def train(train_data: np.ndarray, val_data: np.ndarray, epochs: int, save_every_
 def predict(sentence: str = "", max_length: int = 50, temperature: float = 0.7) -> tuple[str, Tensor]:
     model.eval()

-    tokens: list = [SOS_INDEX] + tokenizer.encode(sentence, add_special_tokens=False).ids
+    tokens: list = [SOS_INDEX, *tokenizer.encode(sentence, add_special_tokens=False).ids]

     for _ in range(max_length):
         inputs = np.asarray(tokens).reshape(1, -1)
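The [SOS_INDEX, *ids] form is the iterable-unpacking spelling of list concatenation, which linters such as ruff (rule RUF005) prefer because it avoids building an intermediate list. A tiny sketch with made-up token ids:

    SOS_INDEX = 1      # hypothetical id of the <sos> token
    ids = [17, 42, 7]  # hypothetical ids returned by the tokenizer

    tokens = [SOS_INDEX, *ids]           # -> [1, 17, 42, 7]
    print(tokens == [SOS_INDEX] + ids)   # True: both spellings build the same list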

examples/seq2seq.py

Lines changed: 33 additions & 32 deletions
@@ -1,5 +1,6 @@
 import math
 from pathlib import Path
+from typing import Optional

 import matplotlib.pyplot as plt
 import numpy as np
@@ -9,7 +10,7 @@

 import neunet
 import neunet.nn as nn
-from datasets import load_dataset
+from datasets import load_dataset # type: ignore
 from neunet import Tensor
 from neunet.optim import Adam

@@ -27,7 +28,8 @@ def __init__(self, d_model, n_heads, dropout=0.1):
         self.scale = math.sqrt(d_model)
         self.dropout = nn.Dropout(dropout)

-        assert d_model % n_heads == 0
+        if d_model % n_heads != 0:
+            raise ValueError("d_model must be divisible by n_heads")

         self.depth = d_model // n_heads

@@ -37,7 +39,7 @@ def __init__(self, d_model, n_heads, dropout=0.1):

         self.fc = nn.Linear(d_model, d_model)

-    def forward(self, q: Tensor, k: Tensor, v: Tensor, mask: Tensor=None):
+    def forward(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor]=None):
         batch_size = q.shape[0]
         q = self.wq(q).contiguous().reshape(batch_size, -1, self.n_heads, self.depth).transpose(0, 2, 1, 3)
         k = self.wk(k).contiguous().reshape(batch_size, -1, self.n_heads, self.depth).transpose(0, 2, 1, 3)
@@ -246,7 +248,7 @@ def forward(self, src: np.ndarray, tgt: np.ndarray) -> tuple[Tensor, Tensor]:
 PAD_TOKEN = '<pad>' # noqa: S105
 SOS_TOKEN = '<sos>' # noqa: S105
 EOS_TOKEN = '<eos>' # noqa: S105
-# UNK_TOKEN = '<unk>' # noqa: S105
+# UNK_TOKEN = '<unk>'

 DATASET_PATH = Path("./datasets/multi30k/")
 SAVE_PATH = Path("./saved models/seq2seq/")
@@ -255,23 +257,22 @@ def forward(self, src: np.ndarray, tgt: np.ndarray) -> tuple[Tensor, Tensor]:
 data = load_dataset("bentrevett/multi30k", cache_dir="datasets/multi30k")

 for split, split_dataset in data.items():
-    with open(f"./datasets/multi30k/{split}.en", 'w', encoding='utf-8') as f:
+    with Path(f"./datasets/multi30k/{split}.en").open('w', encoding='utf-8') as f:
         for item in split_dataset:
             f.write(item['en'] + '\n')

-    with open(f"./datasets/multi30k/{split}.de", 'w', encoding='utf-8') as f:
+    with Path(f"./datasets/multi30k/{split}.de").open('w', encoding='utf-8') as f:
         for item in split_dataset:
             f.write(item['de'] + '\n')

 FILE_PATHS = [DATASET_PATH / "train.en", DATASET_PATH / "train.de", DATASET_PATH / "val.en", DATASET_PATH / "val.de", DATASET_PATH / "test.en", DATASET_PATH / "test.de"]
-FILE_PATHS = [str(path) for path in FILE_PATHS]


 # [Train and load Tokenizer]
 if not (SAVE_PATH / "vocab").exists():
     tokenizer = ByteLevelBPETokenizer()

-    tokenizer.train(files=FILE_PATHS, vocab_size=15000, min_frequency=1, special_tokens=[
+    tokenizer.train(files=[str(path) for path in FILE_PATHS], vocab_size=15000, min_frequency=1, special_tokens=[
         PAD_TOKEN,
         SOS_TOKEN,
         EOS_TOKEN,
@@ -298,7 +299,7 @@ class DataPreprocessor():
     def __init__(self, tokenizer: ByteLevelBPETokenizer):
         self.tokenizer = tokenizer

-        self.tokenizer._tokenizer.post_processor = TemplateProcessing(
+        self.tokenizer._tokenizer.post_processor = TemplateProcessing( # noqa SLF001
             single=f"{SOS_TOKEN} $A {EOS_TOKEN}",
             special_tokens=[
                 (f"{SOS_TOKEN}", tokenizer.token_to_id(f"{SOS_TOKEN}")),
@@ -309,13 +310,13 @@ def __init__(self, tokenizer: ByteLevelBPETokenizer):
         # self.tokenizer.enable_truncation(max_length=128)
         self.tokenizer.enable_padding(pad_token = PAD_TOKEN)

-    def tokenize(self, paths: list[str], batch_size: int, lines_limit: int = None) -> np.ndarray:
+    def tokenize(self, paths: list[str], batch_size: int, lines_limit: Optional[int] = None) -> list[np.ndarray]:
         examples = []

         for src_file in paths:
             print(f"Processing {src_file}")
-            src_file = Path(src_file)
-            lines = src_file.read_text(encoding="utf-8").splitlines()
+            path_src_file = Path(src_file)
+            lines = path_src_file.read_text(encoding="utf-8").splitlines()

             if lines_limit:
                 lines = lines[:lines_limit]
@@ -326,20 +327,20 @@ def tokenize(self, paths: list[str], batch_size: int, lines_limit: int = None) -

         return examples

-    def __call__(self, paths: list[str], batch_size: int, lines_limit: int = None) -> np.ndarray:
+    def __call__(self, paths: list[str], batch_size: int, lines_limit: Optional[int] = None) -> list[np.ndarray]:
         return self.tokenize(paths, batch_size, lines_limit)


 data_post_processor = DataPreprocessor(tokenizer)

-train_src = data_post_processor([DATASET_PATH / "train.en"], batch_size = BATCH_SIZE)
-train_tgt = data_post_processor([DATASET_PATH / "train.de"], batch_size = BATCH_SIZE)
+train_src = data_post_processor([str(DATASET_PATH / "train.en")], batch_size = BATCH_SIZE)
+train_tgt = data_post_processor([str(DATASET_PATH / "train.de")], batch_size = BATCH_SIZE)

-val_src = data_post_processor([DATASET_PATH / "val.en"], batch_size = BATCH_SIZE)
-val_tgt = data_post_processor([DATASET_PATH / "val.de"], batch_size = BATCH_SIZE)
+val_src = data_post_processor([str(DATASET_PATH / "val.en")], batch_size = BATCH_SIZE)
+val_tgt = data_post_processor([str(DATASET_PATH / "val.de")], batch_size = BATCH_SIZE)

-test_src = data_post_processor([DATASET_PATH / "test.en"], batch_size = BATCH_SIZE)
-test_tgt = data_post_processor([DATASET_PATH / "test.de"], batch_size = BATCH_SIZE)
+test_src = data_post_processor([str(DATASET_PATH / "test.en")], batch_size = BATCH_SIZE)
+test_tgt = data_post_processor([str(DATASET_PATH / "test.de")], batch_size = BATCH_SIZE)


 train_data = train_src, train_tgt
@@ -386,11 +387,11 @@ def __call__(self, paths: list[str], batch_size: int, lines_limit: int = None) -

 # [train, eval, predict methods definition]

-def train_step(source: np.ndarray, target: np.ndarray, epoch: int, epochs: int) -> float:
+def train_step(source: list[np.ndarray], target: list[np.ndarray], epoch: int, epochs: int) -> float:
     loss_history = []
     model.train()

-    tqdm_range = tqdm(enumerate(zip(source, target)), total = len(source))
+    tqdm_range = tqdm(enumerate(zip(source, target, strict=False)), total = len(source))
     for batch_num, (source_batch, target_batch) in tqdm_range:

         output, _ = model.forward(source_batch, target_batch[:,:-1])
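On strict=False: since Python 3.10, zip accepts a strict flag, and linters (ruff's B905, for example) ask for it to be explicit. strict=False keeps the old behaviour of silently stopping at the shorter input, while strict=True would raise if the source and target batch lists ever differed in length. A small sketch (Python 3.10+):

    src_batches = ["batch0", "batch1", "batch2"]  # hypothetical stand-ins for numpy batches
    tgt_batches = ["batch0", "batch1"]            # one batch short on purpose

    print(list(zip(src_batches, tgt_batches, strict=False)))  # two pairs; the extra source batch is dropped

    try:
        list(zip(src_batches, tgt_batches, strict=True))      # raises because the lengths differ
    except ValueError as err:
        print(err)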
@@ -419,11 +420,11 @@ def train_step(source: np.ndarray, target: np.ndarray, epoch: int, epochs: int)

     return epoch_loss

-def eval(source: np.ndarray, target: np.ndarray) -> float:
+def eval(source: list[np.ndarray], target: list[np.ndarray]) -> float:
     loss_history = []
     model.eval()

-    tqdm_range = tqdm(enumerate(zip(source, target)), total = len(source))
+    tqdm_range = tqdm(enumerate(zip(source, target, strict=False)), total = len(source))
     for batch_num, (source_batch, target_batch) in tqdm_range:

         output, _ = model.forward(source_batch, target_batch[:,:-1])
@@ -447,7 +448,7 @@ def eval(source: np.ndarray, target: np.ndarray) -> float:
     return epoch_loss


-def train(train_data: np.ndarray, val_data: np.ndarray, epochs: int, save_every_epochs: int, save_path: str = None, validation_check: bool = False):
+def train(train_data: tuple[list[np.ndarray], list[np.ndarray]], val_data: tuple[list[np.ndarray], list[np.ndarray]], epochs: int, save_every_epochs: int, save_path: Optional[str] = None, validation_check: bool = False):
     best_val_loss = float('inf')

     train_loss_history = []
@@ -547,23 +548,22 @@ def plot_loss_history(train_loss_history, val_loss_history):



-test_data = []
+raw_test_data: list[dict[str, str]] = []

-with open(DATASET_PATH / "test.en", 'r') as f:
-    en_file = [l.strip() for l in open(DATASET_PATH / "test.en", 'r', encoding='utf-8')]
-    de_file = [l.strip() for l in open(DATASET_PATH / "test.de", 'r', encoding='utf-8')]
+en_file = [l.strip() for l in Path(DATASET_PATH / "test.en").open('r', encoding='utf-8')]
+de_file = [l.strip() for l in Path(DATASET_PATH / "test.de").open('r', encoding='utf-8')]

 for i in range(len(en_file)):
     if en_file[i] == '' or de_file[i] == '':
         continue
     en_seq, de_seq = en_file[i], de_file[i]

-    test_data.append({'en': en_seq, 'de': de_seq})
+    raw_test_data.append({'en': en_seq, 'de': de_seq})

 sentences_num = 10

-random_indices = np.random.randint(0, len(test_data), sentences_num)
-sentences_selection = [test_data[i] for i in random_indices]
+random_indices = np.random.randint(0, len(raw_test_data), sentences_num)
+sentences_selection = [raw_test_data[i] for i in random_indices]

 # [Translate sentences from validation set]
 for i, example in enumerate(sentences_selection):
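The renamed raw_test_data list pairs English and German lines by index. Purely as an illustration (the diff keeps the index-based loop), the same pairing can be written with zip; the example sentence pairs below are invented:

    # hypothetical parallel lines standing in for the contents of test.en / test.de
    en_file = ["a man rides a horse .", "", "two dogs play ."]
    de_file = ["ein mann reitet ein pferd .", "", "zwei hunde spielen ."]

    raw_test_data = [
        {"en": en_seq, "de": de_seq}
        for en_seq, de_seq in zip(en_file, de_file, strict=True)  # strict (3.10+): files must align line-for-line
        if en_seq and de_seq  # skip empty lines, as the original loop does
    ]
    print(raw_test_data)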
@@ -575,7 +575,8 @@ def plot_loss_history(train_loss_history, val_loss_history):


 def plot_attention(sentence: str, translation: str, attention: Tensor, heads_num: int = 8, rows_num: int = 2, cols_num: int = 4):
-    assert rows_num * cols_num == heads_num
+    if rows_num * cols_num != heads_num:
+        raise ValueError("heads_num must be equal to rows_num * cols_num")
     attention = attention.detach().cpu().numpy().squeeze()

     sentence = tokenizer.encode(sentence, add_special_tokens=False).tokens
