import os
import json

import numpy as np
import pandas as pd
# The tokenizer classes below come from the Hugging Face `tokenizers` package
# (an assumption based on the class names used in this file). Recent releases
# expose the char-level BPE implementation as CharBPETokenizer; it is imported
# here under the name the loader below expects.
from tokenizers import (BertWordPieceTokenizer, ByteLevelBPETokenizer,
                        CharBPETokenizer as BPETokenizer)

def load_huggingface_tokenizer(tokenizer_path: str):
    """Instantiate a tokenizer from a directory of saved tokenizer files.

    The directory is expected to contain a config.json with a
    'tokenizer_type' key ('BPE', 'BBPE' or 'BERT') plus the matching
    vocab/merges files.
    """
    with open(os.path.join(tokenizer_path, 'config.json'), 'r') as f:
        config = json.load(f)
    tokenizer_type = config['tokenizer_type']
    # Map the configured type onto the corresponding tokenizer class.
    tokenizer_cls = {'BPE': BPETokenizer,
                     'BBPE': ByteLevelBPETokenizer,
                     'BERT': BertWordPieceTokenizer}[tokenizer_type]
    if tokenizer_type in ['BPE', 'BBPE']:
        # BPE-style tokenizers need both a vocab and a merges file.
        vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.json' in x][0]
        merges_file = [x for x in os.listdir(tokenizer_path) if 'merges.txt' in x][0]
        tokenizer = tokenizer_cls(vocab_file=os.path.join(tokenizer_path, vocab_file),
                                  merges_file=os.path.join(tokenizer_path, merges_file))
    else:
        # WordPiece only needs a plain-text vocab file.
        vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.txt' in x][0]
        tokenizer = tokenizer_cls(vocab_file=os.path.join(tokenizer_path, vocab_file))
    return tokenizer
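
# Minimal usage sketch (added as an illustration; the './tokenizer' path is
# hypothetical and not part of the original file):
#
#     tokenizer = load_huggingface_tokenizer('./tokenizer')
#     encoding = tokenizer.encode('hello world')
#     print(encoding.ids)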

def jackknife(data, num_partitions=5):
    """Yield (train, test) DataFrame pairs, holding out one partition at a time."""
    # Shuffle the rows before partitioning.
    data = data.sample(frac=1)
    splits = np.split(data, range(0, data.shape[0], int(data.shape[0] / num_partitions))[1:])
    for i, split in enumerate(splits):
        train_parts = list(range(0, num_partitions))
        try:
            train_parts.remove(i)
            # Everything except the held-out partition becomes the training set.
            yield pd.concat([splits[ix] for ix in train_parts], axis=0), split
        except ValueError:
            # A leftover remainder split has no matching index in train_parts; skip it.
            continue
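
# Usage sketch on a small synthetic DataFrame (the column names here are
# illustrative only, not part of the original code):
#
#     toy = pd.DataFrame({'text': [f'doc {i}' for i in range(10)],
#                         'label': [i % 2 for i in range(10)]})
#     for train_df, test_df in jackknife(toy, num_partitions=5):
#         print(train_df.shape, test_df.shape)   # e.g. (8, 2) (2, 2)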

def stratified_sample(df, col, n_samples):
    """Sample an equal number of rows from every class in `col`."""
    # Cap the per-class sample size at the size of the smallest class.
    n = min(n_samples, df[col].value_counts().min())
    rand_int = np.random.randint(1, 10000)
    df_ = df.groupby(col).apply(lambda x: x.sample(n, random_state=rand_int))
    # Drop the extra index level added by groupby().apply().
    df_.index = df_.index.droplevel(0)
    return df_
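
# Usage sketch with an imbalanced synthetic frame (illustrative names only):
#
#     toy = pd.DataFrame({'label': ['a'] * 6 + ['b'] * 3, 'value': range(9)})
#     balanced = stratified_sample(toy, 'label', n_samples=5)
#     print(balanced['label'].value_counts())   # 3 rows for each class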

def replace_bool(x):
    """Map the strings 'true'/'false' to 1/0 and pass other values through."""
    if x == 'true':
        return 1
    elif x == 'false':
        return 0
    else:
        return x
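
# Small self-contained smoke test (an addition, not part of the original
# Space); it only exercises replace_bool on a toy DataFrame.
if __name__ == '__main__':
    demo = pd.DataFrame({'flag': ['true', 'false', 'true', 'maybe']})
    # String booleans become integers; anything else passes through unchanged.
    print(demo['flag'].apply(replace_bool).tolist())   # [1, 0, 1, 'maybe']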