跳转到内容

质量分析报告

python
import pandas as pd
import numpy as np

class QualityReport:
    """Build a plain-text data-quality report for a conversation DataFrame.

    The report lists the total row count, per-column missing values,
    the number of duplicated rows (judged on ``dedup_cols``) and summary
    statistics for every numeric column.
    """

    # Columns whose combined value decides whether two rows are duplicates.
    DEFAULT_DEDUP_COLS = ('user_message', 'assistant_message')

    def __init__(self, df, dedup_cols=None):
        """Store the frame to analyse.

        Args:
            df: The pandas DataFrame to report on.
            dedup_cols: Column labels used to detect duplicated rows.
                Defaults to ``DEFAULT_DEDUP_COLS``.
        """
        self.df = df
        # Always hand pandas a list: a tuple could be read as one single label.
        self.dedup_cols = list(
            self.DEFAULT_DEDUP_COLS if dedup_cols is None else dedup_cols
        )

    @staticmethod
    def _percent(count, total):
        """Return ``count`` as a percentage of ``total`` (0.0 if ``total`` is 0)."""
        return count / total * 100 if total else 0.0

    def generate(self):
        """Render the report and return it as a single string.

        Returns:
            The report text, sections separated by newlines.

        Raises:
            KeyError: If one of ``dedup_cols`` is not a column of the frame
                (raised by ``DataFrame.duplicated``).
        """
        lines = []
        n = len(self.df)
        lines.append(f"\n{'='*60}")
        lines.append("  千万级对话语料 — 质量分析报告")
        lines.append(f"  总样本: {n:,}")
        lines.append(f"{'='*60}")

        # Missing values: only columns that actually contain NaN are listed.
        na = self.df.isna().sum()
        has_na = na[na > 0]
        if len(has_na) > 0:
            lines.append(f"\n❌ 缺失值 ({len(has_na)} 列):")
            for col, cnt in has_na.items():
                cnt = int(cnt)
                lines.append(f"  {col}: {cnt:,} ({self._percent(cnt, n):.2f}%)")

        # Duplicates: convert to a Python int so an empty frame cannot turn
        # the percentage into a numpy 0/0 (nan plus a RuntimeWarning).
        dup = int(self.df.duplicated(subset=self.dedup_cols).sum())
        lines.append(f"\n🔁 内容重复: {dup:,} ({self._percent(dup, n):.2f}%)")

        # Numeric summary: a fixed subset of describe() columns, one row per column.
        num_cols = self.df.select_dtypes(include=[np.number]).columns
        if len(num_cols) > 0:
            desc = self.df[num_cols].describe().T[['count','mean','std','min','50%','max']]
            lines.append(f"\n📊 数值统计:\n{desc.round(2).to_string()}")

        return '\n'.join(lines)

# Build the report for `df` (expected to be defined before this point)
# and print the rendered text.
report = QualityReport(df)
report_text = report.generate()
print(report_text)

基于 MIT 许可发布