In "How to Fine-Tune Large Language Models (LLMs) with TRL" and "Fine-Tuning FLAN-T5 for Chat and Dialogue Summarization", we learned how to fine-tune large models with Hugging Face. In this article, we fine-tune FLAN-T5 for a sentiment analysis task.
The environment setup is the same as before, so we will jump straight into loading and processing the dataset.
Loading and Processing the Dataset
We use the t1annnnn/Chinese_sentimentAnalyze dataset, a Chinese sentiment analysis dataset hosted on the Hugging Face Hub.
First, we load the dataset with Hugging Face's load_dataset method.
from datasets import load_dataset

dataset_id = "t1annnnn/Chinese_sentimentAnalyze"
# Load dataset from the hub
dataset = load_dataset(dataset_id)
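The rest of the code assumes the dataset exposes train, test, and validation splits, each with text and label columns; a quick inspection confirms the layout:
# Inspect the split names, sizes, and one raw example
print(dataset)
print(dataset["train"][0])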
Next, we convert the integer labels to text labels ("negative"/"positive") and wrap each split in a Dataset object. Since FLAN-T5 is a text-to-text model, it will learn to generate the label word itself.
import pandas as pd
from datasets import Dataset, DatasetDict

def deal_raw_data(dataset, id2label):
    # Split the dataset
    train_data = dataset['train']
    test_data = dataset['test']
    validation_data = dataset['validation']
    # Map integer labels to text labels for each split
    train_labels = [id2label[str(label)] for label in train_data['label']]
    test_labels = [id2label[str(label)] for label in test_data['label']]
    validation_labels = [id2label[str(label)] for label in validation_data['label']]
    train_dict = [{'text': text, 'label': label} for text, label in zip(train_data['text'], train_labels)]
    test_dict = [{'text': text, 'label': label} for text, label in zip(test_data['text'], test_labels)]
    validation_dict = [{'text': text, 'label': label} for text, label in
                       zip(validation_data['text'], validation_labels)]
    # Convert the lists to pandas DataFrames
    train_df = pd.DataFrame(train_dict)
    test_df = pd.DataFrame(test_dict)
    validation_df = pd.DataFrame(validation_dict)
    # Convert the DataFrames to Dataset objects
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)
    validation_dataset = Dataset.from_pandas(validation_df)
    # Return a dict containing the train, test, and validation splits
    return {
        'train': train_dataset,
        'test': test_dataset,
        'validation': validation_dataset
    }

id2label = {
    '0': 'negative',
    '1': 'positive'
}
raw_dataset = deal_raw_data(dataset, id2label)
raw_dataset = DatasetDict(raw_dataset)
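A quick sanity check that the label mapping worked:
from collections import Counter

# Each example should now carry a text label instead of 0/1
print(raw_dataset["train"][0])
# Label distribution of the training split
print(Counter(raw_dataset["train"]["label"]))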
Then we tokenize the data into the token IDs the model expects.
from transformers import AutoTokenizer
from datasets import concatenate_datasets

model_id = "google/flan-t5-small"
# Load the FLAN-T5-small tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Tokenize the raw text so we can measure sequence lengths
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda x: tokenizer(x["text"], truncation=True), batched=True, remove_columns=["text", "label"])
# Determine the maximum length of the tokenized text sequences. To feed the model
# uniform-length inputs, we pad short sequences and truncate long ones;
# max_source_length is the longest sequence we accept, so anything longer is
# truncated and anything shorter is padded.
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
def preprocess_function(sample, padding="max_length"):
    # Add the task prefix to the input for T5
    inputs = ["sentiment: " + item for item in sample["text"]]
    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["label"], padding=padding, truncation=True)
    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
tokenized_dataset = raw_dataset.map(preprocess_function, batched=True, remove_columns=["text", "label"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
Setting Up the Evaluation Metric
We use F1 as the evaluation metric.
import evaluate
import numpy as np

# Metric: evaluate with F1
metric = evaluate.load("f1")

def postprocess_text(predictions, labels):
    label2id = {
        'negative': 0,
        'positive': 1
    }
    predictions = [prediction.strip() for prediction in predictions]
    labels = [label2id[label.strip()] for label in labels]
    for idx in range(len(predictions)):
        if predictions[idx] in label2id:
            predictions[idx] = label2id[predictions[idx]]
        else:
            # Generations that are not a known label count as wrong
            predictions[idx] = -100
    return predictions, labels
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    predictions, labels = postprocess_text(predictions, labels)
    result = metric.compute(predictions=predictions, references=labels)
    return result
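A small standalone check of the post-processing logic, using toy strings rather than real model output:
# "positive" maps to 1; an off-label generation like "great" is scored as -100 (always wrong)
preds, refs = postprocess_text([" positive", "great"], ["positive", "negative"])
print(preds, refs)  # [1, -100] [1, 0]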
Training the Model
Before training starts, we first create a data collator (DataCollator), which batches examples and pads them dynamically.
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# Load the pretrained FLAN-T5 model
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)
# Hugging Face repository id (strip the dataset namespace so the id contains no extra slash)
repository_id = f"{model_id.split('/')[1]}-{dataset_id.split('/')[1]}"
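pad_to_multiple_of=8 rounds each padded batch dimension up to a multiple of 8, which plays nicely with tensor cores on modern GPUs. A quick look at the collator output on two examples:
# Collate two processed examples into a single padded batch of tensors
features = [tokenized_dataset["train"][i] for i in range(2)]
batch = data_collator(features)
print(batch["input_ids"].shape, batch["labels"].shape)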
Next, set the training arguments.
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from huggingface_hub import HfFolder

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    predict_with_generate=True,
    fp16=False,  # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=3,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)
# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)
# Start training
trainer.train()
trainer.evaluate()
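Before pushing, we can sanity-check the fine-tuned model on a sentence of our own. The example sentence below is made up; any short Chinese review works:
# Quick prediction with the fine-tuned model
text = "sentiment: 这部电影太好看了!"  # "This movie is really good!"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=4)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))  # ideally "positive"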
Pushing to Hugging Face
# Save our tokenizer and create model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()