# 导入所需的库
import numpy as np
from keras.models import Model
from keras.layers import Input, Dense, Embedding, MultiHeadAttention
from keras.optimizers import Adam
# Hyperparameters of the attention-based sequence model.
vocab_size = 10000  # number of distinct token ids
embedding_dim = 256  # width of each token embedding
num_heads = 8  # number of attention heads
max_seq_length = 128  # tokens per input sequence

# Input: one row of `max_seq_length` token ids per sample.
input_layer = Input(shape=(max_seq_length,))

# Map every token id to a dense vector of size `embedding_dim`.
token_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
embedding_layer = token_embedding(input_layer)

# Self-attention: the embeddings are passed as both query and value (Keras
# uses the value as the key when no key is given). `key_dim` is the size of
# each head, so every one of the `num_heads` heads is `embedding_dim` wide.
# NOTE(review): no causal mask is applied here; if the model is meant to
# predict the next token, confirm whether future positions should be masked.
self_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
attention_layer = self_attention(embedding_layer, embedding_layer)

# Position-wise feed-forward network: ReLU projection followed by a linear
# projection, both `embedding_dim` wide.
ffn_hidden = Dense(units=embedding_dim, activation='relu')(attention_layer)
ffn_layer = Dense(units=embedding_dim)(ffn_hidden)

# Per-position probability distribution over the vocabulary.
vocab_projection = Dense(units=vocab_size, activation='softmax')
output_layer = vocab_projection(ffn_layer)

# Assemble the functional model from its input and output tensors.
model = Model(inputs=input_layer, outputs=output_layer)

# Compile with Adam and categorical cross-entropy, tracking accuracy.
# NOTE(review): 'categorical_crossentropy' expects one-hot targets; confirm
# the training labels are one-hot encoded rather than integer token ids.
adam = Adam(learning_rate=0.001)
model.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer structure and parameter counts.
model.summary()
参数说明
`vocab_size = 10000`:定义词汇表的大小为10000。
`embedding_dim = 256`:定义嵌入层的维度为256。
`num_heads = 8`:定义多头注意力的头数为8。
`max_seq_length = 128`:定义序列的最大长度为128。
`input_layer = Input(shape=(max_seq_length,))`:定义输入层,输入的形状为序列的最大长度。
`embedding_layer = Embedding(vocab_size, embedding_dim)(input_layer)`:定义嵌入层,将输入的整数序列转换为固定大小的向量表示。
`attention_layer = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(embedding_layer, embedding_layer)`:定义多头注意力层,将嵌入层的输出作为查询和键值进行注意力计算。
`ffn_layer = Dense(embedding_dim, activation='relu')(attention_layer)`:定义前馈神经网络层,使用ReLU激活函数。
`ffn_layer = Dense(embedding_dim)(ffn_layer)`:再次定义全连接层,输出维度与嵌入层相同。
`output_layer = Dense(vocab_size, activation='softmax')(ffn_layer)`:定义输出层,将前馈神经网络层的输出转换为概率分布,用于预测下一个词。
`model = Model(inputs=input_layer, outputs=output_layer)`:使用输入层和输出层构建模型。
`model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])`:编译模型,使用Adam优化器,损失函数为分类交叉熵,评估指标为准确率。
`model.summary()`:打印模型的结构,包括各层的参数数量和形状。
模型结构
Model: "model"
__________________________________________________________________________________________________
Layer (type)                Output Shape          Param #    Connected to
==================================================================================================
input_1 (InputLayer)        [(None, 128)]         0          []
embedding (Embedding)       (None, 128, 256)      2560000    ['input_1[0][0]']
multi_head_attention (Mult  (None, 128, 256)      2103552    ['embedding[0][0]',
iHeadAttention)                                               'embedding[0][0]']
dense (Dense)               (None, 128, 256)      65792      ['multi_head_attention[0][0]']
dense_1 (Dense)             (None, 128, 256)      65792      ['dense[0][0]']
dense_2 (Dense)             (None, 128, 10000)    2570000    ['dense_1[0][0]']
==================================================================================================
Total params: 7365136 (28.10 MB)
Trainable params: 7365136 (28.10 MB)
Non-trainable params: 0 (0.00 Byte)