OpenWebText Download and Cleaning Tutorial

Published: 2024-12-08 04:41


1. First, go to Hugging Face and download the compressed archives. Link: Skylion007/openwebtext at main (huggingface.co)
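If you would rather fetch everything from the command line than click each archive in the browser, a minimal sketch using the huggingface_hub library is shown below. The local_dir path and the allow_patterns filter are assumptions, not part of the original post; check the dataset page's file listing for the actual archive layout before relying on them.

# Minimal sketch: fetch the OpenWebText archives with huggingface_hub.
# local_dir and allow_patterns are examples; verify the archive names/layout
# on the dataset page before relying on them.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="Skylion007/openwebtext",
    repo_type="dataset",
    local_dir="D:/large_model/data/openwebtext_raw",
    allow_patterns=["subsets/*.tar"],  # only the compressed subsets
)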

2. Extract all the documents from the 21 archives into a single folder. (Note that the script below hardcodes the urlsf_subset20 file-name pattern, so adjust the pattern or rerun it for each subset.)

import lzma
import os
import tarfile

def decompress_xz_files(src_dir, dest_dir, start_index=1, end_index=1000):
    """Decompress .xz files containing multiple documents and copy each document to the destination directory."""
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    for i in range(start_index, end_index + 1):
        src_file = f"{src_dir}/urlsf_subset20-{i}_data.xz"
        if os.path.exists(src_file):
            if tarfile.is_tarfile(src_file):
                # Archive is a tarball compressed with xz: extract every member
                with tarfile.open(src_file, mode='r:xz') as tar:
                    tar.extractall(path=dest_dir)
                print(f"Extracted all contents of {src_file} to {dest_dir}")
            else:
                # Plain xz-compressed text: decompress it into a single .txt file
                dest_file_path = os.path.join(dest_dir, f"extracted_content_{i}.txt")
                with lzma.open(src_file, 'rt') as file:
                    content = file.read()
                with open(dest_file_path, 'w') as out_file:
                    out_file.write(content)
                print(f"Decompressed and copied content from {src_file} to {dest_file_path}")
        else:
            print(f"File {src_file} does not exist")

source_directory = 'D:/large_model/data/urlsf_subset20/openwebtext'
destination_directory = 'D:/large_model/data/data'
decompress_xz_files(source_directory, destination_directory)


3. Merge all the txt documents in the folder into one large JSON file. Remember to pass the --data_path and --output_file arguments when running the code below; a sample invocation follows the script.

import glob
import json
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, default=".",
                        help="path where all the txt files are located")
    parser.add_argument("--output_file", type=str, default="merged_output.json",
                        help="filename where the merged json should go")
    args = parser.parse_args()

    data_path = args.data_path
    out_file = args.output_file

    text_files = glob.glob(data_path + '/*.txt')
    counter = 0

    with open(out_file, 'w', encoding='UTF-8') as outfile:
        for fname in text_files:
            counter += 1
            if counter % 1024 == 0:
                print("Merging at ", counter, flush=True)
            with open(fname, 'r', encoding='UTF-8') as infile:
                for row in infile:
                    # Each input line becomes one {"text": ...} record in the output JSONL
                    tmp = {}
                    tmp['text'] = row
                    outfile.write(json.dumps(tmp))
                    outfile.write('\n')

    print("Merged file", out_file, flush=True)

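A sample invocation, assuming the script above is saved as merge_txt_to_json.py (the file name is arbitrary) and the extracted txt files sit in the folder from step 2:

python merge_txt_to_json.py --data_path D:/large_model/data/data --output_file D:/large_model/data/merged_output.json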

4. Data cleaning: drop documents with fewer than 128 tokens. This step takes a long time; I modified the code below to run with 12 threads in parallel, and the cleaned data shrank from 42 GB to 11 GB. (A sketch of one possible parallel wrapper follows the script.)

import ftfy
import json
from langdetect import detect
import numpy as np
import time
import os
import sys

from tokenizer import Tokenizer

MIN_DOCUMENT_LENGHT = 128

def print_progress(prefix, start_time, num_docs, num_fixed_text,
                   num_non_english_docs, chars_non_english_docs,
                   num_small_docs, chars_small_docs):
    string = prefix + ' | '
    string += 'elapsed time: {:.2f} | '.format(time.time() - start_time)
    string += 'documents: {} | '.format(num_docs)
    string += 'fixed text: {} | '.format(num_fixed_text)
    string += 'non-english: {} | '.format(num_non_english_docs)
    string += 'non-english chars: {} | '.format(chars_non_english_docs)
    string += 'small docs: {} | '.format(num_small_docs)
    string += 'small docs chars: {}'.format(chars_small_docs)
    print(string, flush=True)

def filter_corpus(filename, out_filename, print_interval=10000):
    print(' > filtering {}'.format(filename))
    tokenizer = Tokenizer(cache_dir='./cache')

    num_docs = 0
    num_written_docs = 0
    num_small_docs = 0
    num_fixed_text = 0
    num_non_english_docs = 0
    chars_non_english_docs = 0
    chars_small_docs = 0

    start_time = time.time()
    with open(out_filename, 'wb') as f:
        with open(filename, 'r') as fin:
            for line in fin:
                try:
                    num_docs += 1
                    myjson = json.loads(line)
                    text = ftfy.fix_text(myjson['text'])
                    if text != myjson['text']:
                        num_fixed_text += 1
                    myjson['text'] = text
                    if detect(text) != 'en':
                        print('[non-english text]', myjson)
                        num_non_english_docs += 1
                        chars_non_english_docs += len(text)
                        continue
                    if len(text) < (8 * MIN_DOCUMENT_LENGHT):
                        tokens = tokenizer.tokenize_document(text)
                        if len(tokens) < MIN_DOCUMENT_LENGHT:
                            print('[small document, skipping]:', myjson)
                            num_small_docs += 1
                            chars_small_docs += len(text)
                            continue
                    myjson = json.dumps(myjson, ensure_ascii=False)
                    f.write(myjson.encode('utf-8'))
                    f.write('\n'.encode('utf-8'))
                    num_written_docs += 1
                    if num_docs % print_interval == 0:
                        print_progress('[PROGRESS]', start_time, num_docs,
                                       num_fixed_text, num_non_english_docs,
                                       chars_non_english_docs,
                                       num_small_docs, chars_small_docs)
                except Exception as e:
                    print('    skipping ', line, e)

    print_progress('[FINAL]', start_time, num_docs,
                   num_fixed_text, num_non_english_docs,
                   chars_non_english_docs,
                   num_small_docs, chars_small_docs)

if __name__ == '__main__':
    print('building gpt2 dataset ...')

    input_filename = sys.argv[1]
    output_filename = sys.argv[2]

    print('will be reading {}'.format(input_filename))
    print('and will write the results to {}'.format(output_filename))

    filter_corpus(input_filename, output_filename)
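The script is run as python cleanup_dataset.py <input.json> <output.json> (the file name cleanup_dataset.py is an assumption; the code closely follows the cleanup script in Megatron-LM's openwebtext tools). The 12-thread modification mentioned above is not shown, so what follows is only a sketch of one way to get a similar speedup: split the merged JSONL into shards and clean each shard in its own process. Because langdetect and tokenization are CPU-bound, processes rather than Python threads are what actually run in parallel.

# Sketch of one way to parallelize the cleaning step (not the author's exact
# 12-thread modification). Assumes the cleaning script above is saved as
# cleanup_dataset.py so that filter_corpus can be imported.
import os
from multiprocessing import Pool

from cleanup_dataset import filter_corpus  # module name is an assumption

NUM_WORKERS = 12

def split_into_shards(input_file, shard_dir, num_shards):
    """Distribute the lines of input_file round-robin over num_shards files."""
    os.makedirs(shard_dir, exist_ok=True)
    shard_paths = [os.path.join(shard_dir, f'shard_{i}.json') for i in range(num_shards)]
    shards = [open(p, 'w', encoding='utf-8') for p in shard_paths]
    with open(input_file, 'r', encoding='utf-8') as fin:
        for i, line in enumerate(fin):
            shards[i % num_shards].write(line)
    for f in shards:
        f.close()
    return shard_paths

def clean_shard(shard_path):
    out_path = shard_path.replace('.json', '_clean.json')
    filter_corpus(shard_path, out_path)
    return out_path

if __name__ == '__main__':
    shard_paths = split_into_shards('merged_output.json', './shards', NUM_WORKERS)
    with Pool(NUM_WORKERS) as pool:
        cleaned_paths = pool.map(clean_shard, shard_paths)
    # Concatenate the cleaned shards into the file shuffled in step 5
    with open('merge_cleand.json', 'wb') as out:
        for path in cleaned_paths:
            with open(path, 'rb') as f:
                out.write(f.read())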

5. Shuffle the cleaned dataset with shuf.

shuf merge_cleand.json -o train_data.json
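shuf is a GNU coreutils tool; since the earlier paths suggest a Windows machine, a rough Python equivalent is sketched below, with the caveat that it assumes the cleaned file fits in memory.

# Rough in-memory equivalent of shuf; assumes merge_cleand.json fits in RAM.
import random

with open('merge_cleand.json', 'r', encoding='utf-8') as f:
    lines = f.readlines()
random.shuffle(lines)
with open('train_data.json', 'w', encoding='utf-8') as f:
    f.writelines(lines)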

6. Data preprocessing

python tools/preprocess_data.py \
    --input /workspace/data/train_data.json \
    --output-prefix /workspace/data/my-gpt2 \
    --vocab-file /workspace/model/gpt2-vocab/gpt2-vocab.json \
    --dataset-impl mmap \
    --tokenizer-type GPT2BPETokenizer \
    --merge-file /workspace/model/gpt2-vocab/gpt2-merges.txt \
    --append-eod \
    --workers 20 \
    --chunk-size 25

The output files are named my-gpt2_text_document.bin and my-gpt2_text_document.idx and are used for GPT-2 pretraining.

If you have questions or need any of the files above, including the cleaned dataset, send me a private message.

