Summary of LoRA Fine-Tuning for Large Models

Published: January 16, 2024

Large Model Fine-Tuning

Model Loading

With DeepSpeed (DeepSpeed manages device placement, so device_map is not set):

import transformers

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    torch_dtype='auto',
    trust_remote_code=True,
)

Without DeepSpeed (device_map='auto' places the model on the available GPUs):

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    device_map='auto',
    torch_dtype='auto',
    trust_remote_code=True,
)

Using LoRA

from peft import LoraConfig, get_peft_model

LORA_R = 32
# LORA_ALPHA = 16
LORA_DROPOUT = 0.05
TARGET_MODULES = [
    "o_proj", "gate_proj", "down_proj", "up_proj"
]

config = LoraConfig(
    r=LORA_R,
    # lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
# wrap the base model with the LoRA adapters
model = get_peft_model(model, config)
# print the fraction of trainable parameters
model.print_trainable_parameters()

Loading the Tokenizer

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_args.model_name_or_path, trust_remote_code=True)

Data Loading

Data is loaded with Hugging Face's datasets library.

With datasets it is easy to load data; examples are shown below:

from datasets import load_dataset
dataset = load_dataset('csv', data_files='my_file.csv')
dataset = load_dataset('csv', data_files=['my_file_1.csv', 'my_file_2.csv', 'my_file_3.csv'])
dataset = load_dataset('csv', data_files={'train':['my_train_file_1.csv','my_train_file_2.csv'],'test': 'my_test_file.csv'})

Our own data can be loaded as follows:

from typing import Optional

from datasets import Dataset, load_dataset


def load_dataset_from_own(data_path: Optional[str] = None,
                          cache_dir: Optional[str] = "cache_data") -> Dataset:
    all_file_list = ['a.json', 'b.json', 'c.json']
    data_files = {'train': all_file_list}
    # infer the loader type (e.g. "json") from the file extension
    extension = all_file_list[0].split(".")[-1]
    datasets = load_dataset(
        extension,
        data_files=data_files,
        cache_dir=cache_dir,
    )['train']
    return datasets

Building sources and targets

  1. Build the prompt templates:
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}
  2. Build sources from the prompt templates:
prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
sources = [
    prompt_input.format_map({'instruction': ins_data[i], 'input': input_data[i]})
    if input_data[i] != ""
    else prompt_no_input.format_map({'instruction': ins_data[i]})
    for i in range(len_)
]
# truncate each source to the configured maximum length
sources = [i[:data_args.source_length] for i in sources]
  3. Build targets (truncate each answer and append the EOS token):
targets = [f"{example[:data_args.target_length-1]}{tokenizer.eos_token}" for example in output]

Building input_ids and labels

Given the texts to encode, this function returns the constructed ids:

from typing import Dict, Sequence

import transformers

IGNORE_INDEX = -100  # label value that the loss function ignores


def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize a list of strings."""
    tokenized_list = [
        tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        for text in strings
    ]
    # collect the token ids
    input_ids = labels = [tokenized.input_ids[0]
                          for tokenized in tokenized_list]
    # fall back to IGNORE_INDEX when the tokenizer has no pad token
    ne_pad_token_id = IGNORE_INDEX if tokenizer.pad_token_id is None else tokenizer.pad_token_id
    # count the non-padding tokens of each example
    input_ids_lens = labels_lens = [
        tokenized.input_ids.ne(ne_pad_token_id).sum().item() for tokenized in tokenized_list
    ]
    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )

Build input_ids and labels:

import copy

# tokenize prompt+answer and prompt-only separately
examples = [s + t for s, t in zip(sources, targets)]
examples_tokenized, sources_tokenized = [_tokenize_fn(
    strings, tokenizer) for strings in (examples, sources)]
input_ids = examples_tokenized["input_ids"]
labels = copy.deepcopy(input_ids)
# mask the prompt tokens so the loss is computed only on the response
for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
    label[:source_len] = IGNORE_INDEX
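
A tiny worked example with hypothetical token ids: if the prompt tokenizes to 4 tokens and the full prompt+answer to 7, only the 3 answer tokens keep their ids and contribute to the loss.

import torch

full_ids = torch.tensor([5, 8, 13, 21, 34, 55, 89])  # hypothetical ids: 4 prompt + 3 answer tokens
label = full_ids.clone()
label[:4] = IGNORE_INDEX  # mask the prompt positions
print(label)  # tensor([-100, -100, -100, -100,   34,   55,   89])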

Label Padding

For dynamic batching we need a data collator to do the padding. DataCollatorWithPadding is not suitable here, because it only pads the input keys (input_ids, attention_mask, token_type_ids) and does not pad labels. Note also that labels are padded with -100 rather than the tokenizer's pad_token, so that these padded positions are ignored when the loss is computed.

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model,
                                       label_pad_token_id=IGNORE_INDEX)
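
As a quick sanity check (a minimal sketch: the two examples are taken from the input_ids/labels built above, and a pad token is assumed to be set on the tokenizer), the collator pads labels with -100 while input_ids get the pad token:

# illustrative only: collate two examples of different length
features = [
    {"input_ids": input_ids[0].tolist(), "labels": labels[0].tolist()},
    {"input_ids": input_ids[1].tolist(), "labels": labels[1].tolist()},
]
batch = data_collator(features)
print(batch["input_ids"].shape, batch["labels"].shape)
print(batch["labels"][0])  # prompt and padding positions appear as -100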

Building the Trainer

from transformers import DataCollatorForSeq2Seq, Trainer

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=None,
    data_collator=data_collator,
)
trainer.train()
trainer.save_state()
trainer.save_model(output_dir=training_args.output_dir)

LoRA Model Inference

Model Loading

from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model_name_or_path = "internlm-7b"
lora_model_name_or_path = "checkpoint-9695"

# load the base model onto GPU 0
model = AutoModelForCausalLM.from_pretrained(
    base_model_name_or_path,
    torch_dtype="auto",
    # device_map="auto",
    trust_remote_code=True,
).cuda(0)

# attach the trained LoRA adapter
model = PeftModel.from_pretrained(model, model_id=lora_model_name_or_path)
model.eval()
print("ok")

# pad on the left so that batched generation works correctly
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name_or_path, trust_remote_code=True, padding_side="left"
)

Batched Inference

from typing import List


def batch_generate_data(
    text_input: List[str], use_train_model: bool = True, temp: float = 0.7
):
    # generate_input wraps each query in the prompt template (a sketch follows after this function)
    text_input_format = [generate_input(i) for i in text_input]
    batch_inputs = tokenizer.batch_encode_plus(
        text_input_format, padding="longest", return_tensors="pt"
    )
    batch_inputs["input_ids"] = batch_inputs["input_ids"].cuda()
    batch_inputs["attention_mask"] = batch_inputs["attention_mask"].cuda()

    if use_train_model:
        # with model.disable_adapter():
        outputs = model.generate(
            **batch_inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=temp,
            top_p=0.8,
        )
    else:
        with model.disable_adapter():
            outputs = model.generate(
                **batch_inputs,
                max_new_tokens=256,
                do_sample=True,
                temperature=temp,
                top_p=0.8,
            )
    outputs = tokenizer.batch_decode(
        outputs.cpu()[:, batch_inputs["input_ids"].shape[-1] :],
        skip_special_tokens=True,
    )

    return outputs
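
batch_generate_data relies on a generate_input helper that the original post does not show. A minimal sketch, assuming it simply wraps the raw query in the same prompt_no_input template used during training:

def generate_input(query: str) -> str:
    # assumption: reuse the training-time prompt template at inference
    return (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        f"### Instruction:\n{query}\n\n### Response:"
    )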

Inference with the LoRA Fine-Tuned Model

text_input = ["工作压力太大怎么办\n"] * 32
# generate with the LoRA fine-tuned adapter
batch_generate_data(text_input, use_train_model=True, temp=0.8)
# generate with the original base model (adapter disabled)
batch_generate_data(text_input, use_train_model=False, temp=0.8)

Merging Model Weights

model = model.merge_and_unload()
model.save_pretrained("internlm-7b-lml")
tokenizer.save_pretrained("internlm-7b-lml")
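
After merge_and_unload, the saved directory is an ordinary Hugging Face checkpoint; a minimal sketch of loading it again without peft (the path matches the save calls above):

from transformers import AutoModelForCausalLM, AutoTokenizer

# the merged checkpoint loads like any regular causal LM; no adapter is needed
merged_model = AutoModelForCausalLM.from_pretrained(
    "internlm-7b-lml", torch_dtype="auto", trust_remote_code=True
)
merged_tokenizer = AutoTokenizer.from_pretrained(
    "internlm-7b-lml", trust_remote_code=True
)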
Source: https://blog.csdn.net/qq128252/article/details/135623991