-
Notifications
You must be signed in to change notification settings - Fork 57
/
Copy pathtokenizer.py
142 lines (114 loc) · 4.67 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import argparse
import json
import os
import sys
import numpy as np
# Module-level setup: everything below runs at import time.
# Paths are resolved relative to this script's own location rather than the current working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
# The tokenizer model file is expected to sit next to this script.
model_path = os.path.join(current_dir, "tokenizer_internlm.model")
# Extend the import search path with the sibling "../transformers" directory;
# presumably that is where the internlm_model module imported below lives.
sys.path.append(os.path.join(current_dir, "../transformers"))
from internlm_model import InternLMTokenizer # noqa: E402 # pylint: disable=C0413
# Single shared tokenizer instance used by write_bin; it is constructed with both
# add_bos_token and add_eos_token enabled.
tokenizer = InternLMTokenizer(vocab_file=model_path, add_bos_token=True, add_eos_token=True)
def write_bin(context: str, bin_file) -> None:
    """
    Tokenize a piece of text and write it to the bin file as one JSON line.

    Args:
        context (str): the raw text to tokenize.
        bin_file (file handler): an already opened bin file that accepts bytes.

    Example:
        >>> with open("out.bin", "ab") as bin_file:
        ...     write_bin("some raw text", bin_file)
        >>> # out.bin now ends with one line of the form
        >>> # {"tokens": [<token id>, <token id>, ...]}
    """
    # The module-level tokenizer turns the text into a list of token ids.
    token_ids = tokenizer.encode(context)

    # One sample per line: a JSON object whose only key is "tokens".
    record = {"tokens": token_ids}
    serialized = json.dumps(record) + "\n"

    # The file holds bytes, so encode the JSON text before writing.
    bin_file.write(serialized.encode())
def prepare_meta(bin_output_path: str):
    """
    Build the ".meta" index that accompanies a bin file.

    The bin file is read line by line; every line must be a JSON object with a
    "tokens" list. For each line a (start byte offset, token count) pair is
    recorded, and all pairs are saved with np.save as an int32 array to
    `bin_output_path + ".meta"`.

    Args:
        bin_output_path (str): path of the existing bin file to index.
    """
    entries = []
    offset = 0
    with open(bin_output_path, "rb") as bin_file:
        # Iterating a binary file yields one raw line at a time (newline included),
        # so len(raw_line) is exactly the number of bytes the line occupies on disk.
        for raw_line in bin_file:
            token_count = len(json.loads(raw_line)["tokens"])
            # (where the line starts, how many tokens it holds)
            entries.append((offset, token_count))
            # The next line starts right after this one.
            offset += len(raw_line)

    meta_path = bin_output_path + ".meta"
    # An open handle (not a path) is given to np.save so the file name is used
    # verbatim instead of getting a ".npy" suffix appended.
    with open(meta_path, "wb") as meta_file:
        index = np.array(entries, dtype=np.int32)
        np.save(meta_file, index)
def text2bin(text_input_path: str, bin_output_path: str):
    """
    Read content from the input file, tokenize it and append it to the bin file.

    Currently support 3 input formats, selected by the file extension:
      - 'txt':   every non-blank line, stripped of surrounding whitespace, is one sample.
      - 'json':  the file holds a list of records; each record is serialized back to a
                 string with json.dumps and tokenized as one sample.
      - 'jsonl': every non-blank line is tokenized as one sample, passed on unmodified.

    The input file is decoded as UTF-8. The bin file is opened in append mode, so
    samples are added after any content the file already has.

    Args:
        text_input_path (str): input file path ('txt', 'json' or 'jsonl').
        bin_output_path (str): output bin file path.

    Raises:
        FileNotFoundError: if `text_input_path` is not an existing file.
        ValueError: if the input file extension is not one of the supported formats.
    """
    # Check that the input file exists before touching the output file.
    if not os.path.isfile(text_input_path):
        raise FileNotFoundError(f"{text_input_path} does not exist.")

    file_format = text_input_path.split(".")[-1]
    # Raise explicitly instead of using `assert`: assertions are removed under
    # `python -O`, and input validation must not disappear with them.
    if file_format not in ("txt", "json", "jsonl"):
        raise ValueError("Invalid input file type. Currently support `txt`, `json` and `jsonl`.")

    # An explicit encoding keeps decoding independent of the platform locale.
    with open(text_input_path, "r", encoding="utf-8") as text_file, open(bin_output_path, "ab") as bin_file:
        if file_format == "txt":
            for line in text_file:
                # Strip any leading/trailing whitespace and skip blank lines.
                stripped_line = line.strip()
                if stripped_line:
                    write_bin(stripped_line, bin_file)

        elif file_format == "json":
            # Assuming the file content is a list of dictionaries.
            for record in json.load(text_file):
                # Each record is turned back into a string before tokenizing.
                write_bin(json.dumps(record), bin_file)

        else:  # "jsonl"
            for line in text_file:
                # Blank lines carry no sample; skipping them avoids writing
                # entries that contain no real content.
                if line.strip():
                    write_bin(line, bin_file)
def parse_args():
    """
    Parse the command-line arguments of this script.

    Both options are required strings:
        --text_input_path: path to the input text file.
        --bin_output_path: path to the output bin file.

    Returns:
        argparse.Namespace: with attributes `text_input_path` and `bin_output_path`.
    """
    cli = argparse.ArgumentParser()
    # Both options share the same type and are mandatory; only flag and help text differ.
    required_options = (
        ("--text_input_path", "Path to the input text file."),
        ("--bin_output_path", "Path to the output bin file."),
    )
    for flag, description in required_options:
        cli.add_argument(flag, type=str, required=True, help=description)
    return cli.parse_args()
def main():
    """
    Command-line entry point.

    Parses the arguments, tokenizes the input file into the bin file, then
    generates the ".meta" file for that bin file, printing a message after each step.
    """
    options = parse_args()
    source_path = options.text_input_path
    target_path = options.bin_output_path

    text2bin(source_path, target_path)
    print(f"Successfully converted {source_path} to {target_path}")

    # To avoid potential read/write errors, the metadata is prepared only after
    # the .bin file has been created and closed.
    prepare_meta(target_path)
    print(f"Successfully generated {target_path}.meta")
# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()