diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..fab1cece29054c4d72ce50bbf2d211122bd0a68d
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,55 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "DistilBertTokenizer",
+  "unk_token": "[UNK]"
+}
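
For context, a minimal sketch of how this config would typically be consumed, assuming the file sits in a local checkpoint directory next to the tokenizer's vocab.txt; the "./my-distilbert" path is hypothetical and only illustrates loading with the transformers library:

    # Minimal sketch (assumption: a local directory "./my-distilbert" containing
    # this tokenizer_config.json plus the matching vocab.txt).
    from transformers import DistilBertTokenizer

    tokenizer = DistilBertTokenizer.from_pretrained("./my-distilbert")

    # The config wires up the special tokens, lowercasing, and length limit:
    encoded = tokenizer("Hello, World!")
    print(encoded["input_ids"])        # begins with 101 ([CLS]) and ends with 102 ([SEP])
    print(tokenizer.pad_token_id)      # 0, per the "[PAD]" entry in added_tokens_decoder
    print(tokenizer.model_max_length)  # 512, per "model_max_length"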