下载模型：https://huggingface.co/uer/gpt2-chinese-cluecorpussmall/tree/main （下载后放到本脚本同级目录下的 gpt2-chinese-cluecorpussmall 文件夹中）
# -*- encoding=utf-8 -*-
import os
import json
import math
import pandas as pd
from typing import List
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from transformers import AutoModelForCausalLM, AutoTokenizer
def init_ppl_model(device="cpu", model_name_or_path=None):
    """Load the causal language model and tokenizer used for perplexity scoring.

    Args:
        device: Target passed to ``model.to``. Defaults to ``"cpu"``, which
            matches the previously hard-coded behaviour.
        model_name_or_path: Location handed to ``from_pretrained`` for both
            the model and the tokenizer. When ``None`` (the default), the
            ``gpt2-chinese-cluecorpussmall`` directory that sits next to this
            file is used.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    if model_name_or_path is None:
        # Resolve relative to this file so the lookup does not depend on the
        # caller's current working directory.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        model_name_or_path = os.path.join(
            base_dir, "./gpt2-chinese-cluecorpussmall"
        )

    # NOTE(review): ``is_decoder=True`` is an extra keyword given to
    # ``from_pretrained``; presumably it overrides the matching config
    # attribute -- confirm against the transformers documentation.
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path, is_decoder=True
    )
    model = model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    return model, tokenizer
def get_ppl_longllmlingua(text, question, model, tokenizer):
"""
采用llmlingua计算ppl
"""
def get_token_length(text: str, add_special_tokens: bool = True):
return len(
tokenizer(text, add_special_tokens=add_special_tokens).input_ids
)
granularity = "sentence"
ppl =