跳转到内容

导出训练格式

python
import pandas as pd

class SFTExporter:
    def __init__(self, df):
        self.df = df
    
    def build_messages(self):
        self.df['messages'] = self.df.apply(
            lambda r: [
                {"role": "user", "content": str(r.get('user_message', ''))},
                {"role": "assistant", "content": str(r.get('assistant_message', ''))}
            ], axis=1
        )
        return self
    
    def split_and_export(self, val_ratio=0.05, output_dir='./sft_output'):
        import os
        os.makedirs(output_dir, exist_ok=True)
        
        n = len(self.df)
        val_n = int(n * val_ratio)
        
        train = self.df.iloc[val_n:]
        val = self.df.iloc[:val_n]
        
        train[['messages']].to_json(
            f'{output_dir}/train.jsonl', orient='records', lines=True,
            force_ascii=False, indent=None
        )
        val[['messages']].to_json(
            f'{output_dir}/val.jsonl', orient='records', lines=True,
            force_ascii=False, indent=None
        )
        
        print(f"导出完成:")
        print(f"  Train: {len(train):,} 条 → {output_dir}/train.jsonl")
        print(f"  Val:   {len(val):,} 条 → {output_dir}/val.jsonl")

exporter = SFTExporter(df)
exporter.build_messages().split_and_export()

基于 MIT 许可发布