RAG排序策略探索——基于GPT2的PPL的方案

发布时间:2024年01月24日

1.准备GPT2大模型

下载模型:https://huggingface.co/uer/gpt2-chinese-cluecorpussmall/tree/main

2.计算PPL进行排序

# -*- encoding=utf-8 -*-
import os
import json
import math
import pandas as pd
from typing import List

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from transformers import AutoModelForCausalLM, AutoTokenizer


def init_ppl_model():
    """
    Load the GPT-2 causal language model and tokenizer used for perplexity.

    The checkpoint is read from the ``gpt2-chinese-cluecorpussmall`` directory
    that sits next to this file, and the model is moved to the CPU.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Resolve the checkpoint relative to this file, not the working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    checkpoint_path = os.path.join(script_dir, "./gpt2-chinese-cluecorpussmall")

    target_device = "cpu"
    lm = AutoModelForCausalLM.from_pretrained(
        checkpoint_path, is_decoder=True
    ).to(target_device)
    tok = AutoTokenizer.from_pretrained(checkpoint_path)
    return lm, tok


def get_ppl_longllmlingua(text, question, model, tokenizer):
    """
    采用llmlingua计算ppl
    """
    def get_token_length(text: str, add_special_tokens: bool = True):
        """Return how many token ids the enclosing tokenizer yields for ``text``."""
        encoded = tokenizer(text, add_special_tokens=add_special_tokens)
        return len(encoded.input_ids)

    granularity = "sentence"
    ppl =
文章来源:https://blog.csdn.net/jingyi130705008/article/details/135832358
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。