OpenWebText Download and Cleaning Tutorial
1. First, go to Hugging Face and download the compressed archives, link: Skylion007/openwebtext at main (huggingface.co). A scripted download alternative is sketched just below.
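If you would rather script the download than click through the web page, the huggingface_hub library can fetch the whole dataset repo. This is only a minimal sketch, assuming huggingface_hub is installed; the repo id comes from the link above, while the local path and the pattern filter are illustrative and should be adapted to your own layout.

# Minimal download sketch (assumption: `pip install huggingface_hub` has been run;
# adjust local_dir and allow_patterns to match your directory layout).
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="Skylion007/openwebtext",
    repo_type="dataset",               # this is a dataset repo, not a model repo
    local_dir="D:/large_model/data",   # example path matching the script in step 2
    # allow_patterns=["*.xz", "*.tar"],  # optionally restrict to the archive files
)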
2. Extract all of the documents in the 21 archives into a single folder.
import lzma
import os
import tarfile

def decompress_xz_files(src_dir, dest_dir, start_index=1, end_index=1000):
    """Decompress .xz files containing multiple documents and copy each document to the destination directory."""
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    for i in range(start_index, end_index + 1):
        src_file = f"{src_dir}/urlsf_subset20-{i}_data.xz"
        if os.path.exists(src_file):
            if tarfile.is_tarfile(src_file):
                # Archive is a tar.xz: extract every contained document.
                with tarfile.open(src_file, mode='r:xz') as tar:
                    tar.extractall(path=dest_dir)
                print(f"Extracted all contents of {src_file} to {dest_dir}")
            else:
                # Plain .xz stream: decompress it into a single text file.
                dest_file_path = os.path.join(dest_dir, f"extracted_content_{i}.txt")
                with lzma.open(src_file, 'rt') as file:
                    content = file.read()
                with open(dest_file_path, 'w') as out_file:
                    out_file.write(content)
                print(f"Decompressed and copied content from {src_file} to {dest_file_path}")
        else:
            print(f"File {src_file} does not exist")

source_directory = 'D:/large_model/data/urlsf_subset20/openwebtext'
destination_directory = 'D:/large_model/data/data'
decompress_xz_files(source_directory, destination_directory)
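The function above only walks subset 20 (urlsf_subset20-{i}_data.xz); the other 20 subsets follow the same naming with a different subset number. A hedged generalisation, assuming each *_data.xz is a tar.xz of .txt documents (the lzma fallback above covers any non-tar case), could glob across all subsets at once:

# Sketch: extract every *_data.xz found anywhere under src_root into dest_dir.
# Assumes each archive is a tar.xz of .txt documents.
import glob
import os
import tarfile

def decompress_all_subsets(src_root, dest_dir):
    os.makedirs(dest_dir, exist_ok=True)
    for src_file in sorted(glob.glob(os.path.join(src_root, "**", "*_data.xz"),
                                     recursive=True)):
        with tarfile.open(src_file, mode="r:xz") as tar:
            tar.extractall(path=dest_dir)
        print(f"Extracted {src_file} -> {dest_dir}")

decompress_all_subsets("D:/large_model/data", "D:/large_model/data/data")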
3. Merge all the txt documents in the folder into one large JSON file. Remember to pass the --data_path and --output_file arguments when running the code below (a usage example follows the script).
import glob
import json
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, default=".",
                        help="path where all the txt files are located")
    parser.add_argument("--output_file", type=str, default="merged_output.json",
                        help="filename where the merged json should go")
    args = parser.parse_args()

    data_path = args.data_path
    out_file = args.output_file

    text_files = glob.glob(data_path + '/*.txt')
    counter = 0

    with open(out_file, 'w', encoding='UTF-8') as outfile:
        for fname in text_files:
            counter += 1
            if counter % 1024 == 0:
                print("Merging at ", counter, flush=True)
            with open(fname, 'r', encoding='UTF-8') as infile:
                for row in infile:
                    # Wrap each line as one JSON document of the form {"text": ...}.
                    tmp = {}
                    tmp['text'] = row
                    outfile.write(json.dumps(tmp))
                    outfile.write('\n')

    print("Merged file", out_file, flush=True)
4. Data cleaning: drop documents with fewer than 128 tokens. This step takes a long time; I modified the code below to run with 12 threads in parallel, and after cleaning the data shrank from 42 GB to 11 GB. (A sketch of one way to parallelise it follows the script.)
import ftfy
import json
from langdetect import detect
import time
import sys

from tokenizer import Tokenizer

MIN_DOCUMENT_LENGHT = 128


def print_progress(prefix, start_time, num_docs, num_fixed_text,
                   num_non_english_docs, chars_non_english_docs,
                   num_small_docs, chars_small_docs):
    string = prefix + ' | '
    string += 'elapsed time: {:.2f} | '.format(time.time() - start_time)
    string += 'documents: {} | '.format(num_docs)
    string += 'fixed text: {} | '.format(num_fixed_text)
    string += 'non-english: {} | '.format(num_non_english_docs)
    string += 'non-english chars: {} | '.format(chars_non_english_docs)
    string += 'small docs: {} | '.format(num_small_docs)
    string += 'small docs chars: {}'.format(chars_small_docs)
    print(string, flush=True)


def filter_corpus(filename, out_filename, print_interval=10000):
    print(' > filtering {}'.format(filename))
    tokenizer = Tokenizer(cache_dir='./cache')
    num_docs = 0
    num_written_docs = 0
    num_small_docs = 0
    num_fixed_text = 0
    num_non_english_docs = 0
    chars_non_english_docs = 0
    chars_small_docs = 0
    start_time = time.time()
    with open(out_filename, 'wb') as f:
        with open(filename, 'r') as fin:
            for line in fin:
                try:
                    num_docs += 1
                    myjson = json.loads(line)
                    # Repair broken unicode / mojibake.
                    text = ftfy.fix_text(myjson['text'])
                    if text != myjson['text']:
                        num_fixed_text += 1
                    myjson['text'] = text
                    # Skip non-English documents.
                    if detect(text) != 'en':
                        print('[non-english text]', myjson)
                        num_non_english_docs += 1
                        chars_non_english_docs += len(text)
                        continue
                    # Only tokenize documents that look short, then drop those
                    # with fewer than MIN_DOCUMENT_LENGHT tokens.
                    if len(text) < (8 * MIN_DOCUMENT_LENGHT):
                        tokens = tokenizer.tokenize_document(text)
                        if len(tokens) < MIN_DOCUMENT_LENGHT:
                            print('[small document, skipping]:', myjson)
                            num_small_docs += 1
                            chars_small_docs += len(text)
                            continue
                    myjson = json.dumps(myjson, ensure_ascii=False)
                    f.write(myjson.encode('utf-8'))
                    f.write('\n'.encode('utf-8'))
                    num_written_docs += 1
                    if num_docs % print_interval == 0:
                        print_progress('[PROGRESS]', start_time, num_docs,
                                       num_fixed_text, num_non_english_docs,
                                       chars_non_english_docs,
                                       num_small_docs, chars_small_docs)
                except Exception as e:
                    print('    skipping ', line, e)
    print_progress('[FINAL]', start_time, num_docs,
                   num_fixed_text, num_non_english_docs,
                   chars_non_english_docs,
                   num_small_docs, chars_small_docs)


if __name__ == '__main__':
    print('building gpt2 dataset ...')
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    print('will be reading {}'.format(input_filename))
    print('and will write the results to {}'.format(output_filename))
    filter_corpus(input_filename, output_filename)
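The exact 12-worker modification is not reproduced above. One possible sketch, replacing the __main__ block of the script, is to shard the input file and run filter_corpus on each shard in its own process (processes rather than threads, because langdetect and tokenization are CPU-bound and would not benefit from Python threads); the worker count and shard filenames here are only placeholders.

# One possible parallelisation sketch (not the exact modification used for the
# 12-thread run): shard the input JSON-lines file, clean each shard in its own
# process, then concatenate the cleaned shards.
import multiprocessing as mp

def split_input(filename, num_shards):
    """Distribute the lines of `filename` round-robin into `num_shards` shard files."""
    shard_names = [f"{filename}.shard{i}" for i in range(num_shards)]
    shards = [open(name, 'w', encoding='utf-8') for name in shard_names]
    with open(filename, 'r', encoding='utf-8') as fin:
        for lineno, line in enumerate(fin):
            shards[lineno % num_shards].write(line)
    for shard in shards:
        shard.close()
    return shard_names

def filter_shard(shard_name):
    filter_corpus(shard_name, shard_name + '.cleaned')

if __name__ == '__main__':
    num_workers = 12
    shard_names = split_input(sys.argv[1], num_workers)
    with mp.Pool(num_workers) as pool:
        pool.map(filter_shard, shard_names)
    # Afterwards, concatenate the shard outputs, e.g.:
    #   cat merged_output.json.shard*.cleaned > merge_cleand.json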
5. Shuffle the cleaned dataset with shuf:
shuf merge_cleand.json -o train_data.json
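An optional sanity check (not part of the original pipeline) before moving on: confirm that every line of the shuffled file is valid JSON with a "text" string, since the preprocessing script in the next step consumes one JSON object per line keyed on "text".

# Optional check that train_data.json is well-formed JSON lines with a "text" field.
import json

def check_jsonl(path, max_report=5):
    bad = 0
    lineno = 0
    with open(path, 'r', encoding='utf-8') as fin:
        for lineno, line in enumerate(fin, 1):
            try:
                doc = json.loads(line)
                if not isinstance(doc.get('text'), str):
                    raise ValueError('missing "text" field')
            except (json.JSONDecodeError, ValueError) as e:
                bad += 1
                if bad <= max_report:
                    print(f'bad line {lineno}: {e}')
    print(f'{bad} bad lines out of {lineno}')

check_jsonl('train_data.json')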
6. Data preprocessing
python tools/preprocess_data.py \
--input /workspace/data/train_data.json \
--output-prefix /workspace/data/my-gpt2 \
       --vocab-file /workspace/model/gpt2-vocab/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file /workspace/model/gpt2-vocab/gpt2-merges.txt \
--append-eod \
--workers 20 \
--chunk-size 25
The output files are named my-gpt2_text_document.bin and my-gpt2_text_document.idx; they are the data files used for GPT-2 pretraining.
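When launching GPT-2 pretraining with Megatron-LM, these two files are normally referenced together by their common prefix, e.g. --data-path /workspace/data/my-gpt2_text_document (no .bin/.idx extension); the training script resolves the pair itself.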
If you have questions or need any of the files above, including the cleaned dataset, send me a private message.