import json
import time

import requests

from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
    FaithfulnessMetric,
)
from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase

from services.ChatService import ChatService
class QwenModel(DeepEvalBaseLLM):
    """DeepEval LLM adapter that proxies prompts to a Qwen model behind a FastGPT gateway."""

    def __init__(self):
        # SECURITY NOTE(review): credentials are hard-coded in source; move the
        # key (and ideally the URL) to environment variables or a config file.
        self.api_key = "fastgpt-*******"
        self.base_url = "https://jz-fastgpt-stable.djtest.cn/api/v1"
        self.model_name = "qwen-max"

    def load_model(self):
        # DeepEval hook; this adapter object is itself the "model".
        return self

    def generate(self, prompt: str) -> str:
        """Send *prompt* to the chat-completions endpoint and return the reply text.

        Raises:
            RuntimeError: on any non-200 HTTP response.
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": self.model_name,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0,  # deterministic output for evaluation runs
        }
        # json= lets requests serialize the payload itself (replaces the manual
        # json.dumps); timeout= guards against the call hanging forever, which
        # the original request could do.
        response = requests.post(
            f"{self.base_url}/chat/completions",
            headers=headers,
            json=payload,
            timeout=60,
        )
        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        raise RuntimeError(f"API 调用失败: {response.status_code}, {response.text}")

    async def a_generate(self, prompt: str) -> str:
        # NOTE(review): still a blocking HTTP call inside a coroutine — fine for
        # this sequential script, but it would stall an event loop under real
        # concurrency (would need run_in_executor or an async HTTP client).
        return self.generate(prompt)

    def get_model_name(self):
        return self.model_name
class EvalService:
    """Computes DeepEval RAG metrics (faithfulness, contextual precision/recall/
    relevancy) for a FastGPT chat-completions response."""

    def _measure(self, metric, label: str, ques: str, response) -> dict:
        """Run *metric* on the test case built from (ques, response).

        Returns a dict {"score": ..., "reason": ...} and prints it with the
        given *label* (the print is kept from the original per-metric methods).
        """
        test_case = self.get_test_case(ques, response)
        metric.measure(test_case)
        result = {"score": metric.score, "reason": metric.reason}
        print(f"{label}:{result}")
        return result

    def get_faithfulness(self, ques: str, response):
        return self._measure(
            FaithfulnessMetric(model=QwenModel()), "faithfulness", ques, response
        )

    def get_contextprecision(self, ques: str, response):
        return self._measure(
            ContextualPrecisionMetric(model=QwenModel()), "contextprecision", ques, response
        )

    def get_contextrecall(self, ques: str, response):
        return self._measure(
            ContextualRecallMetric(model=QwenModel()), "contextrecall", ques, response
        )

    def get_contextrelevant(self, ques: str, response):
        return self._measure(
            ContextualRelevancyMetric(model=QwenModel()), "contextrelevant", ques, response
        )

    def get_test_case(self, ques: str, result):
        """Build an LLMTestCase from a FastGPT /chat/completions response dict.

        Assumes the FastGPT payload layout where responseData[1] carries the
        retrieval quote list and responseData[2] the history preview —
        TODO(review): confirm these fixed indices against the deployed
        FastGPT version; they will break if the workflow node order changes.
        """
        quote_list = result["responseData"][1]["quoteList"]
        retrival_context = [f"{quote['q']}:{quote['a']}" for quote in quote_list]
        historypreview = result["responseData"][2]["historyPreview"]
        context = [history["value"] for history in historypreview]
        answer = result["choices"][0]["message"]["content"]
        # NOTE(review): expected_output mirrors actual_output, so precision/
        # recall effectively compare the answer against itself; supply a real
        # gold answer here for meaningful scores.
        return LLMTestCase(
            input=ques,
            actual_output=answer,
            expected_output=answer,
            context=context,
            retrieval_context=retrival_context,
        )
if __name__ == "__main__":
    # Placeholder endpoint and key — replace with real values before running.
    url = 'https://XXXXXX/api/v1/chat/completions'
    key = 'fastgpt-XXXXXX'
    cr = ChatService(url, key)

    # Single source of truth for the question (was duplicated five times).
    question = "XXX怎么收费?"
    # Call the AI application and capture the raw chat response.
    result = cr.question_response(question)

    es = EvalService()
    es.get_faithfulness(question, result)
    es.get_contextprecision(question, result)
    es.get_contextrecall(question, result)
    es.get_contextrelevant(question, result)
# Sample run output (kept for reference; was pasted as bare text, which is a syntax error):
# faithfulness:{'score': 1.0, 'reason': '实际输出与检索上下文完全一致,没有任何矛盾之处,所以得到了满分1.00的忠实度评分。'}
# contextprecision:{'score': 1.0, 'reason': '得分为1.00,因为相关的节点(即第一个节点)被正确地排在了最前面。'}
# contextrecall:{'score': 0.5, 'reason': '分数为0.50,因为虽然节点在检索上下文中提到了'}
# contextrelevant:{'score': 0.16666666666666666, 'reason': "分数为0.17,因为大部分检索内容并未涉及XXX问题,例如……"}