Skip to content

Commit ed2cda5

Browse files
Implement from_bytes and read_bytes Methods in WordPiece Tokenizer for WebAssembly Compatibility (#1758)
* Add from_bytes and read_bytes method to WordPiece
* Change wordpiece method return value

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
1 parent 9164247 commit ed2cda5

File tree

1 file changed

+17
-0
lines changed
  • tokenizers/src/models/wordpiece

1 file changed

+17
-0
lines changed

tokenizers/src/models/wordpiece/mod.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,23 @@ impl WordPiece {
172172
Ok(vocab)
173173
}
174174

175+
pub fn read_bytes(vocab: &[u8]) -> Result<Vocab> {
176+
let file = BufReader::new(vocab);
177+
178+
let mut vocab = HashMap::new();
179+
for (index, line) in file.lines().enumerate() {
180+
let line = line?;
181+
vocab.insert(line.trim_end().to_owned(), index as u32);
182+
}
183+
184+
Ok(vocab)
185+
}
186+
187+
/// Build an instance by deserializing it from JSON held in memory.
///
/// `bytes` is anything that can be viewed as a byte slice; its contents are handed to
/// `serde_json::from_slice` and must deserialize into `Self`.
///
/// # Errors
///
/// Returns the `serde_json` error, converted into this function's error type, when
/// deserialization fails.
pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self> {
    serde_json::from_slice(bytes.as_ref()).map_err(Into::into)
}
191+
175192
/// Initialize a `WordPiece` model from a vocab mapping file.
176193
pub fn from_file(vocab: &str) -> WordPieceBuilder {
177194
WordPiece::builder().files(vocab.to_owned())

0 commit comments

Comments
 (0)