How to create vocab.txt? [BERT][ELECTRA][Tokenizers]
# Train a WordPiece tokenizer on a set of text files and export its
# vocabulary as a BERT/ELECTRA-style vocab.txt (one token per line).
#
# Install the dependency first (shell command, not Python):
#   pip install tokenizers
import json
from glob import glob

from tokenizers import BertWordPieceTokenizer

# Glob pattern matching the plain-text training corpus files.
txt_path = '/path/to/your/txts/*.txt'
txts = glob(txt_path)
if not txts:
    # Fail loudly instead of training on an empty corpus.
    raise FileNotFoundError(f"no training files match {txt_path!r}")

tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False,
)

# The tokenizer object itself is trained and then saved below; the call's
# return value is not needed afterwards.
tokenizer.train(
    txts,
    vocab_size=32000,
    min_frequency=2,
    show_progress=True,
    # special_tokens is not passed here; the original snippet carried this
    # commented-out override:
    # special_tokens=['', '', '', '', ''],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

tokenizer.save("./vocab.json", pretty=True)

# vocab.json to vocab.txt
# Explicit UTF-8 so non-ASCII tokens survive regardless of the platform's
# default encoding.
with open('./vocab.json', encoding='utf-8') as f:
    d = json.load(f)

# Mapping of token -> integer id.
vocab = d['model']['vocab']

# A token's line number in vocab.txt is its id, so order by id explicitly
# rather than relying on the key order of the JSON object.
tokens = sorted(vocab, key=vocab.get)

# Newline-separated, with no trailing newline after the last token.
vocab_txt = '\n'.join(tokens)

with open('./vocab.txt', 'wt', encoding='utf-8') as f:
    f.write(vocab_txt)

# References
https://github.com/huggingface/tokenizers/tree/master/bindings/python
https://github.com/stefan-it/turkish-bert/blob/master/CHEATSHEET.md#cased-model