跳转到内容

HuggingFace Datasets 转 NumPy

HuggingFace datasets 库是处理大规模数据集的工具。本篇文章介绍如何将 HuggingFace datasets 格式的数据转换为 NumPy 数组,以便在纯 NumPy 环境中处理。

HuggingFace Datasets 基本结构

python
import numpy as np

class MockHFdataset:
    """Minimal in-memory stand-in for a HuggingFace ``Dataset``.

    Columns are kept as a plain dict mapping column name to a sequence of
    per-row values. Only the pieces of the interface used by the helpers in
    this article are mimicked: ``len()``, ``features`` and indexing by row,
    slice or column name.
    """

    def __init__(self, data_dict):
        """Store the column dict and record the row count.

        Args:
            data_dict: Mapping of column name -> sequence of row values.
                The row count is taken from the first column; an empty
                mapping produces a dataset with zero rows.
        """
        self.data = data_dict
        # Fall back to an empty column so an empty mapping does not raise
        # StopIteration.
        self.num_rows = len(next(iter(data_dict.values()), ()))

    def __len__(self):
        """Return the number of rows."""
        return self.num_rows

    @property
    def features(self):
        """Return a dict keyed by column name, in column order.

        The values are ``None`` because this mock stores no type
        information; iterating the result yields the column names.
        """
        return dict.fromkeys(self.data)

    def __getitem__(self, idx):
        """Look up a column, a single row, or a slice of rows.

        Args:
            idx: A column name (``str``), or an ``int``/``slice`` applied to
                every column.

        Returns:
            The stored column when ``idx`` is a string; otherwise a dict
            mapping each column name to ``column[idx]``.
        """
        if isinstance(idx, str):
            return self.data[idx]
        return {k: v[idx] for k, v in self.data.items()}

    def to_numpy(self):
        """Return a dict mapping each column name to ``np.array(column)``."""
        return {k: np.array(v) for k, v in self.data.items()}

# Example: wrap two small columns in the mock dataset and inspect it.
mock_data = dict(
    input_ids=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    attention_mask=[[1, 1, 1] for _ in range(3)],
)
dataset = MockHFdataset(mock_data)

print(f"数据集大小: {len(dataset)}")
print(f"样本 0: {dataset[0]}")

数据集转 NumPy

python
def dataset_to_numpy(dataset):
    """Convert every column of a dataset into a NumPy array.

    Args:
        dataset: Object exposing ``features`` (iterated to obtain the column
            names) and column lookup via ``dataset[name]``.

    Returns:
        dict mapping each column name to ``np.array`` of that column.
    """
    return {column: np.array(dataset[column]) for column in dataset.features}

# Example: convert a three-column mock dataset and report each array.
mock_data = dict(
    input_ids=[[1, 2, 3, 0, 0], [4, 5, 6, 7, 0]],
    attention_mask=[[1, 1, 1, 0, 0], [1, 1, 1, 1, 0]],
    labels=[[1, 0, 0, 0, 0], [0, 1, 0, 0, 0]],
)
dataset = MockHFdataset(mock_data)
numpy_data = dataset_to_numpy(dataset)

print("转换后的 NumPy 数组:")
for key in numpy_data:
    arr = numpy_data[key]
    print(f"  {key}: shape={arr.shape}, dtype={arr.dtype}")

批量转换大数据集

python
def dataset_to_numpy_batched(dataset, batch_size=1000):
    """Convert a dataset to NumPy arrays one slice of rows at a time.

    Each batch is converted to arrays on its own and the per-batch arrays
    are concatenated along the first axis at the end. The complete result is
    still held in memory; batching only limits how many rows are converted
    from Python objects in a single step.

    Args:
        dataset: Object supporting ``len()`` and slice indexing
            (``dataset[i:j]``) that returns a dict of column name -> rows.
        batch_size: Number of rows per slice; must be at least 1.

    Returns:
        dict mapping each column name to the concatenated array. An empty
        dataset yields an empty dict, since no batch is ever read.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    total = len(dataset)
    chunks = {}

    for start in range(0, total, batch_size):
        # Clamp the final slice so it never reaches past the last row.
        stop = min(start + batch_size, total)
        for name, values in dataset[start:stop].items():
            chunks.setdefault(name, []).append(np.array(values))

    # With zero rows the loop never runs and this is simply an empty dict,
    # instead of failing on an uninitialised accumulator.
    return {name: np.concatenate(parts) for name, parts in chunks.items()}

# Example: 100 rows converted in batches of 30 (the last batch is partial).
large_mock = {
    'input_ids': [list(range(start, start + 10)) for start in range(100)],
    'labels': [[row % 2] * 10 for row in range(100)],
}
large_dataset = MockHFdataset(large_mock)
numpy_data = dataset_to_numpy_batched(large_dataset, batch_size=30)

print(f"大数据集转换: {numpy_data['input_ids'].shape}")

保存和加载

python
def save_numpy_data(data, filepath_prefix):
    """Write each array in ``data`` to its own ``.npy`` file.

    The file for a key is named ``<filepath_prefix>_<key>.npy``, and one
    progress line is printed per file written.

    Args:
        data: dict mapping a key to a NumPy array.
        filepath_prefix: Path prefix shared by all output files.
    """
    for key, arr in data.items():
        filepath = f"{filepath_prefix}_{key}.npy"
        np.save(filepath, arr)
        # Separate the key from the path; without a separator the two run
        # together in the printed message.
        print(f"保存 {key} -> {filepath}")

def load_numpy_data(filepath_prefix, keys):
    """Load arrays stored as ``<filepath_prefix>_<key>.npy`` files.

    Args:
        filepath_prefix: Path prefix shared by the files to read.
        keys: Iterable of keys; one file is read per key.

    Returns:
        dict mapping each key to the array loaded from its file.
    """
    return {name: np.load(f"{filepath_prefix}_{name}.npy") for name in keys}

# 示例
# save_numpy_data(numpy_data, 'train_data')
# loaded = load_numpy_data('train_data', ['input_ids', 'attention_mask'])

掌握 HuggingFace datasets 与 NumPy 的转换对于高效处理训练数据非常重要。

基于 MIT 许可发布