Commit 5c475db

🚨 breaking: Fix training with special tokens (#1617)
* fix training
* fmt
* clippy
1 parent: be25814
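For context, the training path this commit touches is the one exercised by user code like the following minimal sketch, loosely following the builder API shown in the crate's README (exact setter signatures vary across versions; `data.txt` and the choice of `[CLS]` are placeholders). Before this fix, the training word-count pass normalized each raw sequence wholesale, so an added special token occurring in the corpus could itself be normalized and split like ordinary text.

```rust
use tokenizers::decoders::DecoderWrapper;
use tokenizers::models::bpe::{BpeTrainer, BPE};
use tokenizers::normalizers::{Lowercase, NormalizerWrapper};
use tokenizers::pre_tokenizers::PreTokenizerWrapper;
use tokenizers::processors::PostProcessorWrapper;
use tokenizers::{AddedToken, Result, TokenizerBuilder, TokenizerImpl};

fn main() -> Result<()> {
    // A tokenizer whose normalizer lowercases all input text.
    let mut tokenizer: TokenizerImpl<
        BPE,
        NormalizerWrapper,
        PreTokenizerWrapper,
        PostProcessorWrapper,
        DecoderWrapper,
    > = TokenizerBuilder::new()
        .with_model(BPE::default())
        .with_normalizer(Some(Lowercase.into()))
        .build()?;

    // Register "[CLS]" as a special token. The fix ensures sequences seen
    // during training have such tokens extracted *before* normalization.
    tokenizer.add_special_tokens(&[AddedToken::from("[CLS]", true)]);

    let mut trainer = BpeTrainer::default();
    // "data.txt" is a placeholder corpus path.
    tokenizer.train_from_files(&mut trainer, vec!["data.txt".to_string()])?;
    Ok(())
}
```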

tokenizers/src/tokenizer/mod.rs

Lines changed: 7 additions & 2 deletions
```diff
@@ -1128,6 +1128,7 @@ where
     }
 }
 
+#[allow(dead_code)]
 impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
 where
     N: Normalizer,
@@ -1388,7 +1389,9 @@ where
             }
         }),
         |seq| {
-            let normalized = self.do_normalize(seq.as_ref())?;
+            let normalized = self
+                .added_vocabulary
+                .extract_and_normalize(self.normalizer.as_ref(), seq.as_ref());
             let pre_tokenized = self.do_pre_tokenize(normalized)?;
             Ok(pre_tokenized
                 .get_splits(OffsetReferential::Original, OffsetType::Byte)
@@ -1439,7 +1442,9 @@ where
             }
         }),
         |seq| {
-            let normalized = self.do_normalize(seq.as_ref())?;
+            let normalized = self
+                .added_vocabulary
+                .extract_and_normalize(self.normalizer.as_ref(), seq.as_ref());
             let pre_tokenized = self.do_pre_tokenize(normalized)?;
             Ok(pre_tokenized
                 .get_splits(OffsetReferential::Original, OffsetType::Byte)
```
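Both hunks make the same substitution: instead of normalizing each training sequence wholesale via `do_normalize` (presumably why that method now needs `#[allow(dead_code)]`), the word-count pass asks the `AddedVocabulary` to split out added/special tokens first and normalize only the remaining segments. A rough, self-contained sketch of that idea, not the crate's actual implementation (names and types here are illustrative):

```rust
/// Illustrative stand-in for AddedVocabulary::extract_and_normalize:
/// split out added tokens first, then normalize only the other segments.
fn extract_and_normalize_sketch(
    sequence: &str,
    added_tokens: &[&str],
    normalize: impl Fn(&str) -> String,
) -> Vec<String> {
    let mut segments = vec![sequence.to_string()];
    // Split every non-token segment on each added token, keeping the
    // token itself as its own segment.
    for &token in added_tokens {
        segments = segments
            .into_iter()
            .flat_map(|piece| {
                if added_tokens.contains(&piece.as_str()) {
                    return vec![piece]; // already an extracted token
                }
                let mut parts = Vec::new();
                let mut rest = piece.as_str();
                while let Some(idx) = rest.find(token) {
                    if idx > 0 {
                        parts.push(rest[..idx].to_string());
                    }
                    parts.push(token.to_string());
                    rest = &rest[idx + token.len()..];
                }
                if !rest.is_empty() {
                    parts.push(rest.to_string());
                }
                parts
            })
            .collect();
    }
    // Normalize only the segments that are not added tokens.
    segments
        .into_iter()
        .map(|piece| {
            if added_tokens.contains(&piece.as_str()) {
                piece // added tokens bypass the normalizer
            } else {
                normalize(&piece)
            }
        })
        .collect()
}

fn main() {
    let segments = extract_and_normalize_sketch(
        "Hello [CLS] World",
        &["[CLS]"],
        |s| s.to_lowercase(), // stand-in for the configured normalizer
    );
    // Prints ["hello ", "[CLS]", " world"]: the token survives intact,
    // where the old do_normalize path would have lowercased it too.
    println!("{segments:?}");
}
```

The upshot for training: segments matching an added token pass through untouched, so neither the normalizer nor the trainer's word counts treat them as ordinary text.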
