跳转到内容

词向量相似度可视化

词向量相似度是理解词嵌入质量的重要指标。语义相近的词应该具有相似的向量表示,因此它们的余弦相似度应该较高。本篇文章介绍如何使用 NumPy 计算词向量之间的相似度,并可视化词向量空间的结构。通过相似度分析,我们可以验证嵌入矩阵是否学习到了有意义的语义关系。

余弦相似度

余弦相似度是最常用的词向量相似度度量:

python
import numpy as np

def cosine_similarity(v1, v2):
    """计算两个向量的余弦相似度

    cos(θ) = (A · B) / (||A|| × ||B||)
    """
    dot_product = np.dot(v1, v2)
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    return dot_product / (norm1 * norm2 + 1e-8)

# 示例
np.random.seed(42)
v1 = np.random.randn(768)
v2 = np.random.randn(768)

similarity = cosine_similarity(v1, v2)
print(f"余弦相似度: {similarity:.4f}")

批量余弦相似度计算

python
def batch_cosine_similarity(vectors1, vectors2):
    """批量计算余弦相似度

    参数:
        vectors1: (n, d) 第一个向量组
        vectors2: (m, d) 第二个向量组
    返回:
        similarities: (n, m) 相似度矩阵
    """
    # 归一化向量
    norms1 = np.linalg.norm(vectors1, axis=1, keepdims=True)
    norms2 = np.linalg.norm(vectors2, axis=1, keepdims=True)

    normalized1 = vectors1 / (norms1 + 1e-8)
    normalized2 = vectors2 / (norms2 + 1e-8)

    # 矩阵乘法得到余弦相似度
    similarities = np.dot(normalized1, normalized2.T)

    return similarities

# 示例
np.random.seed(42)
vectors = np.random.randn(10, 768).astype(np.float32)

# 计算所有向量对之间的相似度
similarity_matrix = batch_cosine_similarity(vectors, vectors)
print(f"相似度矩阵形状: {similarity_matrix.shape}")
print(f"对角线元素(自身相似度): {np.diag(similarity_matrix)[:3]}")

词向量空间可视化

创建示例词向量

为了演示,我们创建一些带有语义结构的词向量:

python
def create_semantic_embeddings():
    """创建带有语义结构的词向量

    模拟:king - man + woman ≈ queen
    """
    np.random.seed(42)
    embed_dim = 64

    # 基础词汇
    words = ['king', 'man', 'woman', 'queen', 'prince', 'princess',
             'apple', 'fruit', 'orange', 'banana', 'car', 'bus', 'train', 'vehicle']

    # 创建嵌入向量
    embeddings = {}

    # 语义聚类:royalty
    embeddings['king'] = np.array([1.0, 0.5, 0.3] + [np.random.randn() * 0.1 for _ in range(embed_dim - 3)])
    embeddings['queen'] = embeddings['king'] + np.array([0.2, -0.1, 0.1] + [0] * (embed_dim - 3))
    embeddings['prince'] = embeddings['king'] + np.array([-0.1, 0.2, 0.1] + [0] * (embed_dim - 3))
    embeddings['princess'] = embeddings['queen'] + np.array([-0.1, 0.2, 0.1] + [0] * (embed_dim - 3))

    # 语义聚类:gender
    embeddings['man'] = np.array([0.5, 0.8, 0.2] + [np.random.randn() * 0.1 for _ in range(embed_dim - 3)])
    embeddings['woman'] = np.array([0.5, 0.8, 0.3] + [np.random.randn() * 0.1 for _ in range(embed_dim - 3)])

    # 语义聚类:fruit
    embeddings['apple'] = np.array([-0.5, -0.3, 0.8] + [np.random.randn() * 0.1 for _ in range(embed_dim - 3)])
    embeddings['fruit'] = embeddings['apple'] + np.array([0.1, 0.1, -0.2] + [0] * (embed_dim - 3))
    embeddings['orange'] = embeddings['apple'] + np.array([0.2, 0.1, 0.1] + [0] * (embed_dim - 3))
    embeddings['banana'] = embeddings['apple'] + np.array([-0.1, 0.2, 0.1] + [0] * (embed_dim - 3))

    # 语义聚类:vehicle
    embeddings['car'] = np.array([-0.3, 0.2, -0.6] + [np.random.randn() * 0.1 for _ in range(embed_dim - 3)])
    embeddings['bus'] = embeddings['car'] + np.array([0.1, -0.1, 0.2] + [0] * (embed_dim - 3))
    embeddings['train'] = embeddings['car'] + np.array([-0.1, 0.1, 0.3] + [0] * (embed_dim - 3))
    embeddings['vehicle'] = (embeddings['car'] + embeddings['bus'] + embeddings['train']) / 3

    return words, embeddings

words, embeddings = create_semantic_embeddings()
print(f"词汇列表: {words}")
print(f"嵌入维度: {len(embeddings['king'])}")

相似度分析

计算词对之间的相似度

python
def analyze_word_similarities(embeddings, words):
    """分析词对之间的相似度"""
    word_list = list(words)
    n = len(word_list)

    # 构建嵌入矩阵
    emb_matrix = np.array([embeddings[w] for w in word_list])

    # 计算相似度矩阵
    similarity_matrix = batch_cosine_similarity(emb_matrix, emb_matrix)

    return word_list, similarity_matrix

word_list, sim_matrix = analyze_word_similarities(embeddings, words)

print("\n=== 相似度分析 ===")

# 找出最相似的词对
def find_most_similar(word, word_list, similarity_matrix, top_k=5):
    """找出与给定词最相似的词"""
    idx = word_list.index(word)
    similarities = similarity_matrix[idx]

    # 获取排序索引(降序)
    sorted_indices = np.argsort(similarities)[::-1]

    results = []
    for i in sorted_indices[1:top_k+1]:  # 跳过自身
        results.append((word_list[i], similarities[i]))

    return results

# 示例
test_words = ['king', 'woman', 'apple', 'car']
for word in test_words:
    if word in word_list:
        similar = find_most_similar(word, word_list, sim_matrix)
        print(f"\n与 '{word}' 最相似的词:")
        for similar_word, sim in similar:
            print(f"  {similar_word}: {sim:.4f}")

验证类比关系

著名的词类比:"king - man + woman ≈ queen"

python
def verify_word_analogy(embeddings, word1, word2, word3, word4):
    """验证词类比:word1 - word2 + word3 ≈ word4

    king - man + woman ≈ queen
    """
    # 计算向量运算
    v1 = embeddings[word1]
    v2 = embeddings[word2]
    v3 = embeddings[word3]
    v4 = embeddings[word4]

    result = v1 - v2 + v3

    # 计算与预期结果的相似度
    similarity = cosine_similarity(result, v4)
    return similarity

# 示例
analogy_score = verify_word_analogy(embeddings, 'king', 'man', 'woman', 'queen')
print(f"\nking - man + woman ≈ queen 的相似度: {analogy_score:.4f}")

# 更多类比测试
analogies = [
    ('prince', 'man', 'woman', 'princess'),
    ('car', 'bus', 'train', 'vehicle'),
]

for w1, w2, w3, w4 in analogies:
    score = verify_word_analogy(embeddings, w1, w2, w3, w4)
    print(f"{w1} - {w2} + {w3}{w4}: {score:.4f}")

可视化(文本形式)

虽然无法直接绘制图形,但可以用文本形式展示相似度矩阵:

python
def print_similarity_matrix(word_list, similarity_matrix, top_n=10):
    """打印相似度矩阵"""
    n = min(len(word_list), top_n)

    print("\n=== 相似度矩阵(前{}个词)===\n".format(n))

    # 打印表头
    header = "         " + "".join([f"{w:>8}" for w in word_list[:n]])
    print(header)
    print("-" * len(header))

    # 打印矩阵
    for i in range(n):
        row = f"{word_list[i]:>8}"
        for j in range(n):
            row += f"{similarity_matrix[i, j]:>8.2f}"
        print(row)

def print_heatmap(word_list, similarity_matrix):
    """用字符打印热力图"""
    n = len(word_list)
    print("\n=== 相似度热力图 ===\n")

    chars = ' .:-=+*#@'  # 从低到高的字符

    for i in range(n):
        row = f"{word_list[i]:>10} |"
        for j in range(n):
            sim = similarity_matrix[i, j]
            # 将相似度映射到字符
            char_idx = int((sim + 1) / 2 * (len(chars) - 1))
            char_idx = max(0, min(len(chars) - 1, char_idx))
            row += chars[char_idx]
        print(row)

print_similarity_matrix(word_list, sim_matrix)
print_heatmap(word_list, sim_matrix)

最近邻搜索

python
def find_nearest_neighbors(target_word, word_list, embeddings, similarity_matrix, k=5):
    """找到最近邻词

    参数:
        target_word: 目标词
        word_list: 词列表
        embeddings: 嵌入字典
        similarity_matrix: 相似度矩阵
        k: 返回前 k 个最近邻
    """
    if target_word not in word_list:
        return []

    target_idx = word_list.index(target_word)
    similarities = similarity_matrix[target_idx]

    # 排序(降序)
    sorted_indices = np.argsort(similarities)[::-1]

    neighbors = []
    for idx in sorted_indices[1:k+1]:  # 跳过自身
        neighbors.append((word_list[idx], similarities[idx]))

    return neighbors

# 示例
print("\n=== 最近邻搜索 ===")
for word in ['king', 'woman', 'apple', 'car']:
    if word in word_list:
        neighbors = find_nearest_neighbors(word, word_list, embeddings, sim_matrix)
        print(f"\n{word} 的最近邻:")
        for neighbor, sim in neighbors:
            print(f"  {neighbor}: {sim:.4f}")

聚类分析

简单的聚类可以展示词向量的结构:

python
def simple_clustering(word_list, embeddings, n_clusters=4):
    """简单的 K-means 聚类

    注意:这里使用简化的实现,实际应使用 sklearn
    """
    from collections import defaultdict

    # 获取嵌入矩阵
    emb_matrix = np.array([embeddings[w] for w in word_list])

    # 简化的聚类:基于相似度
    sim_matrix = batch_cosine_similarity(emb_matrix, emb_matrix)

    # 简单的基于距离的聚类
    clusters = defaultdict(list)
    assigned = set()

    for i, word in enumerate(word_list):
        if word in assigned:
            continue

        # 找到与这个词最相似的未分配词
        sims = sim_matrix[i]
        sorted_idx = np.argsort(sims)[::-1]

        cluster_id = len(clusters)
        for idx in sorted_idx:
            neighbor_word = word_list[idx]
            if neighbor_word not in assigned:
                clusters[cluster_id].append(neighbor_word)
                assigned.add(neighbor_word)

        if len(clusters) >= n_clusters:
            break

    # 处理剩余的词
    for word in word_list:
        if word not in assigned:
            clusters[len(clusters) % n_clusters].append(word)
            assigned.add(word)

    return dict(clusters)

# 示例
clusters = simple_clustering(word_list, embeddings, n_clusters=4)
print("\n=== 简单聚类结果 ===")
for cluster_id, words in clusters.items():
    print(f"Cluster {cluster_id}: {words}")

常见误区

误区一:忘记归一化

比较向量相似度前应该归一化:

python
# 错误:直接比较
dot_product = np.dot(v1, v2)

# 正确:归一化后比较
norm1 = np.linalg.norm(v1)
norm2 = np.linalg.norm(v2)
cosine_sim = dot_product / (norm1 * norm2 + 1e-8)

误区二:使用欧氏距离代替余弦相似度

欧氏距离和余弦相似度在某些情况下不等价:

python
def euclidean_distance(v1, v2):
    """欧氏距离"""
    return np.linalg.norm(v1 - v2)

# 余弦相似度高不代表欧氏距离近
# 例如:两个方向相同但长度不同的向量,余弦相似度=1,欧氏距离很大
v1 = np.array([1, 0])
v2 = np.array([100, 0])
print(f"余弦相似度: {cosine_similarity(v1, v2):.4f}")
print(f"欧氏距离: {euclidean_distance(v1, v2):.4f}")

误区三:在高维空间中使用欧氏距离

在高维空间中,欧氏距离会趋于相近(维度诅咒)。余弦相似度更适合高维数据:

python
# 高维空间中距离的稀释效应
high_dim = 1000
v1 = np.random.randn(high_dim)
v2 = np.random.randn(high_dim)

# 随机向量的余弦相似度通常接近 0
print(f"高维随机向量余弦相似度: {cosine_similarity(v1, v2):.4f}")
print(f"高维随机向量欧氏距离: {euclidean_distance(v1, v2):.4f}")

API 总结

函数描述
cosine_similarity(v1, v2)计算两个向量的余弦相似度
batch_cosine_similarity(v1, v2)批量计算余弦相似度
find_most_similar(word, ...)找最相似的词
verify_word_analogy(...)验证词类比关系

词向量相似度分析是理解词嵌入质量的重要工具。通过这些技术,我们可以验证嵌入是否学习到了有意义的语义关系。

基于 MIT 许可发布