5 files changed: +15 -8 lines changed
lines changed Original file line number Diff line number Diff line change 22authors = [" Nicolas Patry <nicolas@huggingface.co>" ]
33edition = " 2021"
44name = " node"
5- version = " 0.21.3 -dev.0"
5+ version = " 0.21.4 -dev.0"
66
77# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
88
@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.21.3-dev.0"
+version = "0.21.4-dev.0"
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
 edition = "2021"
 
@@ -2,7 +2,7 @@
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>", "Nicolas Patry <patry.nicolas@protonmail.com>"]
 edition = "2018"
 name = "tokenizers"
-version = "0.21.3-dev.0"
+version = "0.21.4-dev.0"
 homepage = "https://github.com/huggingface/tokenizers"
 repository = "https://github.com/huggingface/tokenizers"
 documentation = "https://docs.rs/tokenizers/"
@@ -9,6 +9,7 @@ use compact_str::CompactString;
 use dary_heap::OctonaryHeap;
 use serde::{Deserialize, Serialize};
 use std::cmp::Ordering;
+use std::collections::HashSet;
 
 #[derive(Debug, Eq)]
 struct Merge {
@@ -116,8 +117,10 @@ impl BpeTrainerBuilder {
 
     /// Set the initial alphabet
     #[must_use]
-    pub fn initial_alphabet(mut self, alphabet: AHashSet<char>) -> Self {
-        self.config.initial_alphabet = alphabet;
+    pub fn initial_alphabet(mut self, alphabet: HashSet<char>) -> Self {
+        let mut initial_alphabet = AHashSet::with_capacity(alphabet.len());
+        initial_alphabet.extend(alphabet);
+        self.config.initial_alphabet = initial_alphabet;
         self
     }
 
@@ -1,3 +1,5 @@
+use std::collections::HashSet;
+
 use super::WordPiece;
 use crate::models::bpe::{BpeTrainer, BpeTrainerBuilder, BPE};
 use crate::tokenizer::{AddedToken, Result, Trainer};
@@ -61,7 +63,7 @@ impl WordPieceTrainerBuilder {
 
     /// Set the initial alphabet
     #[must_use]
-    pub fn initial_alphabet(mut self, alphabet: AHashSet<char>) -> Self {
+    pub fn initial_alphabet(mut self, alphabet: HashSet<char>) -> Self {
         self.bpe_trainer_builder = self.bpe_trainer_builder.initial_alphabet(alphabet);
         self
     }
@@ -138,8 +140,10 @@ impl WordPieceTrainer {
         &self.bpe_trainer.initial_alphabet
     }
 
-    pub fn set_initial_alphabet(&mut self, alphabet: AHashSet<char>) {
-        self.bpe_trainer.initial_alphabet = alphabet;
+    pub fn set_initial_alphabet(&mut self, alphabet: HashSet<char>) {
+        let mut initial_alphabet = AHashSet::with_capacity(alphabet.len());
+        initial_alphabet.extend(alphabet);
+        self.bpe_trainer.initial_alphabet = initial_alphabet;
     }
 
     pub fn continuing_subword_prefix(&self) -> &Option<String> {
You can’t perform that action at this time.
0 commit comments