准备

英文版本有四个，但是中文的目前只有一个版本BERT_base版本
12层，768个隐藏单元，12个注意力头，110M参数
同时还有一个包含104种语言的多语言版本：12层，768个隐藏单元，12个注意力头，110M参数

下载地址

google代码

下载下来文件的解释：

vocab.txt是模型的词典
bert_config.json是超参数的配置
bert_model.ckpt.*预训练好的模型

环境：

tensorflow1.12+

modeling.py

首先是对config(BertConfig对象)深度拷贝一份，如果不是训练，那么把dropout都置为零。如果输入的input_mask为None，那么构造一个shape合适值全为1的input_mask，这表示输入都是“真实”的输入，没有padding的内容。如果token_type_ids为None，那么构造一个shape合适并且值全为0的tensor，表示所有Token都属于第一个句子。然后使用embedding_lookup函数构造词的Embedding，用embedding_postprocessor函数增加位置embeddings和token type的embeddings，然后是layer normalization和dropout。接着用transformer_model函数构造多个Transformer SubLayer然后stack在一起。得到的all_encoder_layers是一个list，长度为num_hidden_layers（默认12），每一层对应一个值。每一个值都是一个shape为[batch_size, seq_length, hidden_size]的tensor。self.sequence_output是最后一层的输出，shape是[batch_size, seq_length, hidden_size]。first_token_tensor是第一个Token([CLS])最后一层的输出，shape是[batch_size, hidden_size]。最后对first_token_tensor再加一个全连接层(激活函数为tanh)，得到的pooled_output的shape仍然是[batch_size, hidden_size]。

BertConfig是为加载配置文件所定义的对象

vocab_size,  # 输入的词典词数量
hidden_size=768,  # 隐藏单元数 
num_hidden_layers=12,  # 堆叠层数
num_attention_heads=12,  # 多头注意力的头数
intermediate_size=3072,  # 前向传播的layer大小
hidden_act="gelu",  # 激活函数
hidden_dropout_prob=0.1,  # 全连接层和pooler层的dropout
attention_probs_dropout_prob=0.1,  # 乘法attention时，softmax后dropout概率
max_position_embeddings=512,  # 输入句子的最大长度
type_vocab_size=2,  # Segment A和 Segment B
initializer_range=0.02:  # 随机初始化正态分布的参数

BertModel对象：

class BertModel(object):
    """BERT model ("Bidirectional Encoder Representations from Transformers").

    Example usage:

    # Already been converted into WordPiece token ids
    input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
    # 1 marks a real token, 0 marks a position that was added as padding
    input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
    token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])

    # hidden_size must be a multiple of num_attention_heads, otherwise
    # transformer_model raises ValueError.
    config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
      num_hidden_layers=8, num_attention_heads=8, intermediate_size=1024)

    model = modeling.BertModel(config=config, is_training=True,
      input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)

    label_embeddings = tf.get_variable(...)
    pooled_output = model.get_pooled_output()
    logits = tf.matmul(pooled_output, label_embeddings)
    """

    def __init__(self,
                 config,  # BertConfig instance
                 is_training,  # whether the model is being trained; controls dropout
                 input_ids,  # int32 Tensor of shape [batch_size, seq_length]
                 input_mask=None,  # (optional) int32 Tensor of shape [batch_size, seq_length]
                 token_type_ids=None,  # (optional) int32 Tensor of shape [batch_size, seq_length]
                 use_one_hot_embeddings=False,  # (optional) bool
                 # If True, look up word embeddings with a one-hot matrix
                 # multiplication; otherwise use a gather-based lookup.
                 # The former is faster on TPU, the latter on GPU and CPU.
                 scope=None):
        """Builds the embedding, encoder and pooler sub-graphs.

        Args:
          config: BertConfig instance. It is deep-copied, so the caller's
            object is never modified.
          is_training: bool. When False both dropout probabilities of the
            copied config are forced to 0.0.
          input_ids: int32 Tensor of shape [batch_size, seq_length].
          input_mask: (optional) int32 Tensor of shape
            [batch_size, seq_length]. Defaults to all ones (no padding).
          token_type_ids: (optional) int32 Tensor of shape
            [batch_size, seq_length]. Defaults to all zeros (every token
            belongs to the first segment).
          use_one_hot_embeddings: (optional) bool, forwarded to
            `embedding_lookup`.
          scope: (optional) variable scope name; defaults to "bert".
        """
        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        input_shape = get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

        # Scopes the names of every variable created through get_variable().
        with tf.variable_scope(scope, default_name="bert"):
            with tf.variable_scope("embeddings"):
                # Word embedding lookup.
                (self.embedding_output, self.embedding_table) = embedding_lookup(
                    input_ids=input_ids,
                    vocab_size=config.vocab_size,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name="word_embeddings",
                    use_one_hot_embeddings=use_one_hot_embeddings)

                # Add the position embeddings and token type embeddings, then
                # apply layer normalization and dropout.
                self.embedding_output = embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

            with tf.variable_scope("encoder"):
                # Turn the 2D mask of shape [batch_size, seq_length] into a
                # 3D mask of shape [batch_size, seq_length, seq_length] that
                # is consumed by the attention computation below.
                attention_mask = create_attention_mask_from_input_mask(
                    input_ids, input_mask)

                # Stack of Transformer layers.
                # all_encoder_layers is a list of length num_hidden_layers
                # (12 by default), one entry per layer, each a tensor of
                # shape [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=get_activation(config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)
            # `sequence_output` is the output of the last layer, shape
            # [batch_size, seq_length, hidden_size].
            self.sequence_output = self.all_encoder_layers[-1]

            with tf.variable_scope("pooler"):
                # Take the last-layer tensor of the first position ([CLS]),
                # going from [batch_size, seq_length, hidden_size] to
                # [batch_size, hidden_size].
                # sequence_output[:, 0:1, :] is [batch_size, 1, hidden_size],
                # so the second dimension is removed with squeeze(axis=1).
                first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
                # One more fully connected layer (tanh); the output is still
                # [batch_size, hidden_size].
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=create_initializer(config.initializer_range))

    def get_pooled_output(self):
        """Returns the pooler output built from the first ([CLS]) position."""
        return self.pooled_output

    def get_sequence_output(self):
        """Gets final hidden layer of encoder.

        Returns:
          float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
          to the final hidden of the transformer encoder.
        """
        return self.sequence_output

    def get_all_encoder_layers(self):
        """Returns the list holding the output of every encoder layer."""
        return self.all_encoder_layers

    def get_embedding_output(self):
        """Gets output of the embedding lookup (i.e., input to the transformer).

        Returns:
          float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
          to the output of the embedding layer, after summing the word
          embeddings with the positional embeddings and the token type embeddings,
          then performing layer normalization. This is the input to the transformer.
        """
        return self.embedding_output

    def get_embedding_table(self):
        """Returns the word embedding table variable."""
        return self.embedding_table

gelu

def gelu(x):
    """Gaussian Error Linear Unit.

    This is a smoother version of the RELU.
    Original paper: https://arxiv.org/abs/1606.08415

    Args:
      x: float Tensor to perform activation.

    Returns:
      `x` with the GELU activation applied.
    """
    # tanh-based approximation of the Gaussian CDF.
    cdf = 0.5 * (1.0 + tf.tanh(
        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
    return x * cdf

embedding_lookup

embedding_lookup函数用于实现Embedding，它有两种方式：使用tf.nn.embedding_lookup和矩阵乘法(one_hot_embedding=True)。前者适合于CPU与GPU，后者适合于TPU。所谓的one-hot方法是把输入id表示成one-hot的向量，当然输入id序列就变成了one-hot的矩阵，然后乘以Embedding矩阵。

def embedding_lookup(input_ids,  # int32 Tensor of shape [batch_size, seq_length] holding WordPiece ids
                     vocab_size,  # size of the vocabulary
                     embedding_size=128,  # width of each embedding vector
                     initializer_range=0.02,  # range used for random initialization
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
    """Looks up word embeddings for a tensor of token ids.

    The function is written for ids of shape
    [batch_size, seq_length, num_inputs]; a rank-2 input is first expanded
    to [batch_size, seq_length, 1].

    Args:
      input_ids: int32 Tensor of token ids.
      vocab_size: number of rows of the embedding table.
      embedding_size: number of columns of the embedding table.
      initializer_range: passed to `create_initializer` for the table.
      word_embedding_name: name of the embedding table variable.
      use_one_hot_embeddings: if True, look up through a one-hot matrix
        multiplication; otherwise use `tf.gather`.

    Returns:
      Tuple `(output, embedding_table)`, where `output` has the leading
      dimensions of the (expanded) ids and a last dimension of
      `num_inputs * embedding_size`.
    """
    # Promote [batch_size, seq_length] to [batch_size, seq_length, 1].
    if input_ids.shape.ndims == 2:
        input_ids = tf.expand_dims(input_ids, axis=[-1])

    # Embedding matrix of shape [vocab_size, embedding_size].
    embedding_table = tf.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        initializer=create_initializer(initializer_range))

    token_ids_1d = tf.reshape(input_ids, [-1])
    if not use_one_hot_embeddings:
        looked_up = tf.gather(embedding_table, token_ids_1d)
    else:
        one_hot_rows = tf.one_hot(token_ids_1d, depth=vocab_size)
        looked_up = tf.matmul(one_hot_rows, embedding_table)

    # Fold the trailing num_inputs dimension into the embedding width:
    # [..., num_inputs, embedding_size] -> [..., num_inputs * embedding_size].
    ids_shape = get_shape_list(input_ids)
    leading_dims = ids_shape[0:-1]
    merged_width = ids_shape[-1] * embedding_size
    looked_up = tf.reshape(looked_up, leading_dims + [merged_width])
    return (looked_up, embedding_table)

embedding_postprocessor

def embedding_postprocessor(input_tensor,  # shape [batch_size, seq_length, embedding_size]
                            use_token_type=False,  # whether to add the `token_type_ids` (segment) embedding
                            token_type_ids=None,  # shape [batch_size, seq_length]; required if `use_token_type` is True
                            token_type_vocab_size=16,  # int, number of token types
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,  # whether to add position embeddings
                            position_embedding_name="position_embeddings",  # name of the position embedding variable
                            initializer_range=0.02,  # initialization range
                            max_position_embeddings=512,  # must be >= the sequence length
                            dropout_prob=0.1):
    """Adds token-type and position embeddings, then layer norm and dropout.

    Args:
      input_tensor: float Tensor of shape
        [batch_size, seq_length, embedding_size].
      use_token_type: whether to add embeddings for `token_type_ids`.
      token_type_ids: int32 Tensor of shape [batch_size, seq_length].
      token_type_vocab_size: number of rows of the token type table.
      token_type_embedding_name: name of the token type table variable.
      use_position_embeddings: whether to add learned position embeddings.
      position_embedding_name: name of the position table variable.
      initializer_range: passed to `create_initializer` for both tables.
      max_position_embeddings: number of rows of the position table.
      dropout_prob: dropout probability passed to `layer_norm_and_dropout`.

    Returns:
      Tensor with the same shape as `input_tensor`.

    Raises:
      ValueError: if `use_token_type` is True and `token_type_ids` is None.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            # The trailing space keeps the two adjacent literals from being
            # joined as "if`use_token_type`".
            raise ValueError("`token_type_ids` must be specified if "
                             "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # The token type vocabulary is small, so a one-hot matrix
        # multiplication is used for the lookup.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        # Fails when seq_length > max_position_embeddings.
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # The table is learned for max_position_embeddings positions,
            # but the actual sequence may be shorter, so only rows
            # [0, seq_length) are sliced out (all columns are kept).
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Prefix the sliced table with size-1 dimensions so that it
            # broadcasts over the leading (batch) dimension(s) of `output`,
            # e.g. [1, seq_length, width] for a rank-3 input.
            position_broadcast_shape = [1] * (num_dims - 2)
            position_broadcast_shape.extend([seq_length, width])
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output

create_attention_mask_from_input_mask

def create_attention_mask_from_input_mask(from_tensor, to_mask):
    """Builds a 3D attention mask from a 2D input mask.

    `to_mask` of shape [batch_size, to_seq_length] is reshaped to
    [batch_size, 1, to_seq_length] and multiplied element-wise (with
    broadcasting) by a tensor of ones of shape
    [batch_size, from_seq_length, 1]. The result has shape
    [batch_size, from_seq_length, to_seq_length]: every "from" row is a
    copy of that example's input mask.

    Example:
      input_ids  = [[1, 2, 3, 0, 0], [1, 3, 5, 6, 1]]
      input_mask = [[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]]

      Mask of the first example (rows 4 and 5 belong to padding positions
      and carry no meaning):
        [[1, 1, 1, 0, 0],   # token 1 may attend to 3 tokens
         [1, 1, 1, 0, 0],   # token 2 may attend to 3 tokens
         [1, 1, 1, 0, 0],   # token 3 may attend to 3 tokens
         [1, 1, 1, 0, 0],
         [1, 1, 1, 0, 0]]

      Mask of the second example is a 5x5 matrix of ones: every token may
      attend to all 5 tokens.

    Args:
      from_tensor: Tensor of rank 2 or 3 whose first two dimensions are
        batch_size and from_seq_length.
      to_mask: Tensor of shape [batch_size, to_seq_length].

    Returns:
      float32 Tensor of shape [batch_size, from_seq_length, to_seq_length].
    """
    from_dims = get_shape_list(from_tensor, expected_rank=[2, 3])
    batch_size, from_seq_length = from_dims[0], from_dims[1]

    to_seq_length = get_shape_list(to_mask, expected_rank=2)[1]

    mask_row = tf.reshape(to_mask, [batch_size, 1, to_seq_length])
    mask_row = tf.cast(mask_row, tf.float32)

    ones_column = tf.ones(
        shape=[batch_size, from_seq_length, 1], dtype=tf.float32)

    # Broadcasting along two dimensions produces the full mask.
    return ones_column * mask_row

attention_layer

"""
这个函数实现论文"Attention is All You Need"里的multi-head attention。
如果`from_tensor`和`to_tensor`是同一个tensor，那么就实现Self-Attention。
`from_tensor`的每个时刻都会attends to `to_tensor`，
也就是用from的Query去乘以所有to的Key，得到weight，然后把所有to的Value加权求和起来。
这个函数首先把`from_tensor`变换成一个"query" tensor，
然后把`to_tensor`变成"key"和"value" tensors。
总共有`num_attention_heads`组Query、Key和Value，
每一个Query，Key和Value的shape都是[batch_size(8), seq_length(128), size_per_head(512/8=64)].
然后计算query和key的内积并且除以size_per_head的平方根(8)。
然后softmax变成概率，最后用概率加权value得到输出。
因为有多个Head，每个Head都输出[batch_size, seq_length, size_per_head]，
最后把8个Head的结果concat起来，就最终得到[batch_size(8), seq_length(128), size_per_head*8=512] 
实际上我们是把这8个Head的Query，Key和Value都放在一个Tensor里面的，
因此实际通过transpose和reshape就达到了上面的效果。
"""


def attention_layer(from_tensor,  # shape [batch_size, from_seq_length, from_width]
                    to_tensor,  # shape [batch_size, to_seq_length, to_width].
                    attention_mask=None,  # shape [batch_size, from_seq_length, to_seq_length]; values are 0 or 1.
                    # When the attention scores are computed, positions whose mask is 0 get a large
                    # negative number added (1 leaves the score unchanged), so that after the
                    # softmax their probability is close to zero. That is how the mask is applied.
                    num_attention_heads=1,  # number of attention heads
                    size_per_head=512,  # size of each head
                    query_act=None,  # activation for the query transform
                    key_act=None,  # activation for the key transform
                    value_act=None,  # activation for the value transform
                    attention_probs_dropout_prob=0.0,  # dropout probability on the attention probabilities
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    # If True, return a 2D Tensor of shape [batch_size * from_seq_length, num_attention_heads * size_per_head];
                    # otherwise a 3D Tensor of shape [batch_size, from_seq_length, num_attention_heads * size_per_head].
                    batch_size=None,  # needed when the inputs were flattened to 2D, because it can no longer be read from the shape
                    from_seq_length=None,  # same as above, for from_seq_length
                    to_seq_length=None):  # same as above, for to_seq_length
    """Multi-headed attention from `from_tensor` to `to_tensor`.

    Returns:
      float Tensor of shape
      [batch_size, from_seq_length, num_attention_heads * size_per_head], or
      [batch_size * from_seq_length, num_attention_heads * size_per_head]
      when `do_return_2d_tensor` is True.

    Raises:
      ValueError: if the two inputs have different ranks, or if they are
        rank 2 and any of `batch_size`, `from_seq_length`, `to_seq_length`
        is None.
    """

    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, width):
        # [B*S, N*H] -> [B, S, N, H] -> [B, N, S, H]
        output_tensor = tf.reshape(
            input_tensor, [batch_size, seq_length, num_attention_heads, width])

        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")
    # For 3D (unflattened) inputs batch_size, from_seq_length and
    # to_seq_length are read from the shapes; values passed in by the caller
    # are overwritten.
    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    # For inputs flattened to 2D all three values must be passed in,
    # otherwise an exception is raised.
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None or to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")
    # Notation used below (the numbers are the running example of these notes):
    #   B = batch size (number of sequences), e.g. 8
    #   F = `from_tensor` sequence length, e.g. 128
    #   T = `to_tensor` sequence length, e.g. 128
    #   N = `num_attention_heads`, e.g. 12
    #   H = `size_per_head`, e.g. 64

    # Flatten from and to into 2D matrices.
    # [8*128, 768]
    from_tensor_2d = reshape_to_matrix(from_tensor)
    # [8*128, 768]
    to_tensor_2d = reshape_to_matrix(to_tensor)

    # Query: `query_layer` = [B*F, N*H] = [8*128, 12*64]
    # (batch of 8, 128 positions, 12 heads, 64-dimensional query per head).
    query_layer = tf.layers.dense(
        from_tensor_2d,
        num_attention_heads * size_per_head,
        activation=query_act,
        name="query",
        kernel_initializer=create_initializer(initializer_range))

    # Same as the query: `key_layer` = [B*T, N*H]
    key_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=key_act,
        name="key",
        kernel_initializer=create_initializer(initializer_range))

    # Same again: `value_layer` = [B*T, N*H]
    value_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=value_act,
        name="value",
        kernel_initializer=create_initializer(initializer_range))

    # Query goes from [B*F, N*H] = [8*128, 12*64] to [B, N, F, H] = [8, 12, 128, 64]
    query_layer = transpose_for_scores(query_layer, batch_size,
                                       num_attention_heads, from_seq_length,
                                       size_per_head)

    # Likewise the key becomes [8, 12, 128, 64]
    key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                     to_seq_length, size_per_head)

    # Dot product of query and key gives the raw attention scores.
    # [8, 12, 128, 64] * [8, 12, 64, 128] = [8, 12, 128, 128]
    # The last two dimensions hold, for each of the 128 "from" positions,
    # its score against each of the 128 "to" positions.
    # `attention_scores` = [B, N, F, T]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    # Scale by 1 / sqrt(size_per_head).
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # From [8, 128, 128] to [8, 1, 128, 128]
        # `attention_mask` = [B, 1, F, T]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])

        # mask == 1 -> (1 - 1) * -10000 = 0, so adder is 0;
        # mask == 0 -> (1 - 0) * -10000 = -10000.
        adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

        # Adding `adder` leaves scores with mask 1 unchanged and lowers
        # scores with mask 0 by 10000. Scores are normally small, so this is
        # effectively minus infinity: after the softmax those positions get
        # a probability close to 0 and cannot be attended to.
        attention_scores += adder

    # softmax
    # `attention_probs` = [B, N, F, T] = [8, 12, 128, 128]
    attention_probs = tf.nn.softmax(attention_scores)

    # Dropout on the attention probabilities. It looks unusual, but it
    # follows the original Transformer paper.
    attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

    # Reshape `value_layer` to [B, T, N, H] = [8, 128, 12, 64]
    value_layer = tf.reshape(
        value_layer,
        [batch_size, to_seq_length, num_attention_heads, size_per_head])

    # `value_layer` becomes [B, N, T, H] = [8, 12, 128, 64]
    value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

    # `context_layer` = [8, 12, 128, 128] * [8, 12, 128, 64] = [8, 12, 128, 64] = [B, N, F, H]
    context_layer = tf.matmul(attention_probs, value_layer)

    # `context_layer` becomes [B, F, N, H] = [8, 128, 12, 64]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

    if do_return_2d_tensor:
        # `context_layer` = [B*F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size * from_seq_length, num_attention_heads * size_per_head])
    else:
        # `context_layer` = [B, F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size, from_seq_length, num_attention_heads * size_per_head])

    return context_layer
    

transformer_model

def transformer_model(input_tensor,  # shape [batch_size, seq_length, hidden_size]
                      attention_mask=None,  # shape [batch_size, seq_length, seq_length]; 1 = may attend to, 0 = may not
                      hidden_size=768,  # number of hidden units of the Transformer
                      num_hidden_layers=12,  # number of stacked layers
                      num_attention_heads=12,  # number of attention heads
                      intermediate_size=3072,  # number of hidden units of the feed-forward layer
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,  # dropout applied before each residual connection
                      attention_probs_dropout_prob=0.1,  # dropout on the attention probabilities
                      initializer_range=0.02,
                      do_return_all_layers=False):  # return every layer's output, or only the last one
    """Stacks `num_hidden_layers` Transformer encoder layers.

    Returns:
      If `do_return_all_layers` is True, a list of length
      `num_hidden_layers` whose elements each have shape
      [batch_size, seq_length, hidden_size]; otherwise a single Tensor of
      that shape holding the output of the last layer.

    Raises:
      ValueError: if `hidden_size` is not a multiple of
        `num_attention_heads`, or if the width of `input_tensor` differs
        from `hidden_size`.
    """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))
    # The heads together must produce hidden_size values, so each of the
    # num_attention_heads heads outputs hidden_size / num_attention_heads.
    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # The residual connection adds the layer input to the self-attention
    # output, so both must have the same shape.
    if input_width != hidden_size:
        raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                         (input_width, hidden_size))

    # To avoid reshaping back and forth between 2D and 3D, every 3D tensor
    # is kept in its 2D form throughout. Reshapes are cheap on GPU/CPU but
    # not on TPU, so this is a TPU optimization.
    # e.g. input_tensor is [8, 128, 768], prev_output is [8*128, 768] = [1024, 768]
    prev_output = reshape_to_matrix(input_tensor)

    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        # Every layer has its own variable scope.
        with tf.variable_scope("layer_%d" % layer_idx):
            layer_input = prev_output
            # Attention sub-layer.
            with tf.variable_scope("attention"):
                attention_heads = []
                with tf.variable_scope("self"):
                    attention_head = attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                    attention_heads.append(attention_head)

                # Only one entry is ever appended above, so the first branch
                # is the one taken here.
                if len(attention_heads) == 1:
                    attention_output = attention_heads[0]
                else:
                    # With several entries their outputs would be
                    # concatenated along the last axis.
                    attention_output = tf.concat(attention_heads, axis=-1)

                # Linear projection to `hidden_size`, then add `layer_input`
                # (residual connection).
                with tf.variable_scope("output"):
                    attention_output = tf.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=create_initializer(initializer_range))
                    attention_output = dropout(attention_output, hidden_dropout_prob)
                    # Residual connection followed by layer norm.
                    attention_output = layer_norm(attention_output + layer_input)

            # Feed-forward (intermediate) layer.
            with tf.variable_scope("intermediate"):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=create_initializer(initializer_range))

            # Linear projection back to `hidden_size` so that the residual
            # connection can be added.
            with tf.variable_scope("output"):
                layer_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=create_initializer(initializer_range))
                layer_output = dropout(layer_output, hidden_dropout_prob)
                layer_output = layer_norm(layer_output + attention_output)
                prev_output = layer_output
                all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        # Restore the original 3D shape of every layer output.
        final_outputs = []
        for layer_output in all_layer_outputs:
            final_output = reshape_from_matrix(layer_output, input_shape)
            final_outputs.append(final_output)
        return final_outputs
    else:
        final_output = reshape_from_matrix(prev_output, input_shape)
        return final_output

tokenization

主要的流程是清除无用字符、把所有的字符统一成一种形式。然后先用BasicTokenizer（基于空格、标点等）来进行基础分词，然后再利用WordpieceTokenizer进行分词。词典中有预留的unused token是为了处理文本中没有的特殊字符。

def convert_to_unicode(text):
    """Return `text` as a unicode string.

    Only the Python 3 case is handled: `str` is returned unchanged and
    `bytes` is decoded as UTF-8, ignoring undecodable bytes.

    Raises:
      ValueError: if `text` is neither `str` nor `bytes`, or if
        `six.PY3` is false.
    """
    if not six.PY3:
        raise ValueError("Not running on Python2 or Python 3?")
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    raise ValueError("Unsupported string type: %s" % (type(text)))
        
def load_vocab(vocab_file):
    """Load a vocabulary file into an ordered token -> index mapping.

    One token is read per line; surrounding whitespace is stripped and the
    zero-based line number becomes the token's index. Reading stops at the
    first empty read (end of file).
    """
    # OrderedDict keeps the tokens in insertion (file) order.
    token_to_id = collections.OrderedDict()
    with tf.gfile.GFile(vocab_file, "r") as reader:
        next_id = 0
        line = convert_to_unicode(reader.readline())
        while line:
            token_to_id[line.strip()] = next_id
            next_id += 1
            line = convert_to_unicode(reader.readline())
    return token_to_id
    
class WordpieceTokenizer(object):
    """Splits whitespace-separated tokens into WordPiece sub-tokens."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
        self.max_input_chars_per_word = max_input_chars_per_word
        self.unk_token = unk_token
        self.vocab = vocab

    def tokenize(self, text):
        """Tokenize `text` into word pieces by greedy longest-match-first.

        For example, with a suitable vocabulary:
          input  = "unaffable"
          output = ["un", "##aff", "##able"]

        For every word the longest prefix found in the vocabulary is taken
        first ("unaffable", "unaffabl", ... until "un" matches); matching
        then continues from the end of that piece. Pieces that do not start
        the word are looked up with a "##" prefix, which marks them as a
        continuation and makes the split reversible. A word longer than
        `max_input_chars_per_word`, or one with a remainder that matches
        nothing, is replaced by `unk_token`.
        """
        text = convert_to_unicode(text)

        pieces = []
        for word in whitespace_tokenize(text):
            letters = list(word)
            word_len = len(letters)
            if word_len > self.max_input_chars_per_word:
                pieces.append(self.unk_token)
                continue

            word_pieces = []
            unmatched = False
            begin = 0
            while begin < word_len:
                match = None
                # Try the longest candidate first and shrink from the right.
                for stop in range(word_len, begin, -1):
                    candidate = "".join(letters[begin:stop])
                    if begin > 0:
                        candidate = "##" + candidate
                    if candidate in self.vocab:
                        match = candidate
                        break
                if match is None:
                    unmatched = True
                    break
                word_pieces.append(match)
                begin = stop

            if unmatched:
                pieces.append(self.unk_token)
            else:
                pieces.extend(word_pieces)
        return pieces

使用

是否调整bert参数：

第一种：
在optimization中的apply_gradients方法中设置：
        for (grad, param) in grads_and_vars:
            if grad is None or param is None:  # or param.name.startswith("bert")  将这个判断条件加入到之前，就不会训练Bert的参数。
                # 同时需要在创建模型的时候令is_train=False
                # https://github.com/huwenxianglyy/bert-use-demo/blob/master/demo3.py
                continue

第二种
创建模型的时候将is_training(BertModel构造函数的参数)设为False

pretrain

数据格式为每一行是一个句子，文档与文档之间通过空行隔开。每个文档可能有多个句子。第一步需要使用create_pretraining_data.py将文本文件转化为TFRecord格式，便于后续的pretrain。

create_pretraining_data

python create_pretraining_data.py \
	--input_file=./sample_text.txt \
	--output_file=/tmp/tf_examples.tfrecord \
	--vocab_file=$BERT_BASE_DIR/vocab.txt \
	--do_lower_case=True \
	--max_seq_length=128 \
	--max_predictions_per_seq=20 \
	--masked_lm_prob=0.15 \
	--random_seed=12345 \
	--dupe_factor=5

max_seq_length Token序列的最大长度
max_predictions_per_seq 最多生成多少个MASK
masked_lm_prob 多少比例的Token变成MASK
dupe_factor 一个文档重复多少次

参数dupe_factor，比如一个句子”it is a good day”，为了充分利用数据，我们可以多次随机的生成MASK，比如第一次可能生成”it is a [MASK] day”，第二次可能生成”it [MASK] a good day”。这个参数控制重复的次数。

masked_lm_prob就是论文里的参数15%。max_predictions_per_seq是一个序列最多MASK多少个Token，它通常等于max_seq_length * masked_lm_prob。

TrainingInstance

例子：假设原始两个句子为：”it is a good day”和”I want to go out”，那么处理后的TrainingInstance可能为：

tokens = ["[CLS]", "it", "is", "a", "[MASK]", "day", "[SEP]", "I", "apple", "to", "go", "out", "[SEP]"]
segment_ids=[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
is_random_next=False
masked_lm_positions=[4, 8, 9] 
   表示Mask后为["[CLS]", "it", "is", "a", "[MASK]", "day", "[SEP]", "I", "[MASK]", "to", "go", "out", "[SEP]"]
masked_lm_labels=["good", "want", "to"]

is_random_next=False表示第二句确实是第一句的下一句，即这两句话是有关联的；在原始BERT的实现中，这种情况对应的next sentence标签为0(标签1表示第二句是随机抽取的句子)。masked_lm_positions记录哪些位置被Mask了，而masked_lm_labels记录被Mask之前的词。

注意：tokens已经处理过了，good被替换成[MASK]，而want被替换成apple，而to还是被替换成它自己，原因前面的理论部分已经介绍过了。因此根据masked_lm_positions、masked_lm_labels和tokens是可以恢复出原始(分词后的)句子的。

class TrainingInstance(object):
    """Value holder for one pre-training example.

    Every constructor argument is stored unchanged under an attribute of
    the same name: `tokens`, `segment_ids`, `masked_lm_positions`,
    `masked_lm_labels` and `is_random_next`.
    """

    def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
                 is_random_next):
        # Token sequence and its parallel per-token segment ids.
        self.tokens, self.segment_ids = tokens, segment_ids
        # Positions selected for masked-LM prediction and their labels.
        self.masked_lm_positions, self.masked_lm_labels = (
            masked_lm_positions, masked_lm_labels)
        # Flag for the next-sentence-prediction task.
        self.is_random_next = is_random_next

create_training_instances

调用create_instances_from_document来从一个文档里抽取多个训练数据(TrainingInstance)。

def create_training_instances(input_files, tokenizer, max_seq_length,
            dupe_factor, short_seq_prob, masked_lm_prob,
            max_predictions_per_seq, rng):
    """Create a shuffled list of `TrainingInstance`s from raw text files.

    Expected input format:
      (1) One sentence per line. These should be real sentences rather than
          whole paragraphs or arbitrary spans, because sentence boundaries
          are used for the next-sentence-prediction task.
      (2) A blank line between documents.

    Args:
      input_files: iterable of file paths opened with `tf.gfile.GFile`.
      tokenizer: object providing `tokenize(text)` and a `vocab` mapping.
      max_seq_length, short_seq_prob, masked_lm_prob,
      max_predictions_per_seq: forwarded unchanged to
        `create_instances_from_document`.
      dupe_factor: number of passes made over the whole document list.
      rng: random generator providing `shuffle` (also forwarded downstream).

    Returns:
      A list of instances, shuffled with `rng`.
    """
    # documents[d][s] is the token list of sentence s in document d.
    documents = [[]]

    for path in input_files:
        with tf.gfile.GFile(path, "r") as reader:
            while True:
                raw_line = tokenization.convert_to_unicode(reader.readline())
                # An empty read (not even a newline) means end of file.
                if not raw_line:
                    break
                text = raw_line.strip()

                # A blank line closes the current document and opens a new one.
                if not text:
                    documents.append([])
                sentence_tokens = tokenizer.tokenize(text)
                if sentence_tokens:
                    documents[-1].append(sentence_tokens)

    # Drop documents that ended up without any sentence, then shuffle.
    documents = [doc for doc in documents if doc]
    rng.shuffle(documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    # Each pass over the corpus draws fresh random choices downstream.
    for _ in range(dupe_factor):
        for doc_index, _doc in enumerate(documents):
            instances.extend(create_instances_from_document(
                documents, doc_index, max_seq_length, short_seq_prob,
                masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

    rng.shuffle(instances)
    return instances

create_instances_from_document

普通的语言模型只要求连续的字符串就行，通常是把所有的文本(比如维基百科的内容)拼接成一个很大很大的文本文件，然后训练的时候随机的从里面抽取固定长度的字符串作为一个”句子”。但是BERT要求我们的输入是一个一个的Document，每个Document有很多句子，这些句子是连贯的真实的句子，需要正确的分句，而不能随机的(比如按照固定长度)切分句子。

def create_instances_from_document(
        all_documents, document_index, max_seq_length, short_seq_prob,
        masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
  """Create multiple `TrainingInstance`s from a single document.

  Args:
    all_documents: list of documents; each document is a list of segments
      and each segment is a list of tokens.
    document_index: index into `all_documents` of the document to process.
    max_seq_length: maximum length of the final token sequence, including
      the three special tokens.
    short_seq_prob: probability of choosing a shorter random target length.
    masked_lm_prob, max_predictions_per_seq, vocab_words: forwarded to
      `create_masked_lm_predictions`.
    rng: random generator providing `random()` and `randint(a, b)`.

  Returns:
    A list of `TrainingInstance` objects built from this document.
  """
  document = all_documents[document_index]
  
  # Reserve 3 positions for [CLS], [SEP], [SEP].
  max_num_tokens = max_seq_length - 3
  
  # We usually want sequences as long as max_seq_length, otherwise the
  # computation spent on padding is wasted. Sometimes, however, we want
  # shorter sequences, because real applications contain short inputs and
  # training only on long ones would create a mismatch. So with probability
  # short_seq_prob a shorter target length is used.
  target_seq_length = max_num_tokens
  # With probability short_seq_prob pick a random length in [2, max_num_tokens].
  if rng.random() < short_seq_prob:
    target_seq_length = rng.randint(2, max_num_tokens)
  
  # We cannot simply concatenate all tokens of a document and pick two
  # random spans: the two spans would very likely come from the same
  # sentence (or the second span would start inside the sentence the first
  # one ends in), which would make next-sentence prediction too easy.
  # Instead the "real" sentence boundaries of the input are used.
  instances = []
  current_chunk = []
  current_length = 0
  i = 0
  while i < len(document):
    segment = document[i]
    current_chunk.append(segment)
    current_length += len(segment)
    # Emit an instance when the document ends or the chunk is long enough.
    if i == len(document) - 1 or current_length >= target_seq_length:
      if current_chunk:
        # `a_end` is the number of segments of current_chunk that go into
        # sentence A (i.e. the exclusive end index of A).
        a_end = 1
        # Pick the split point at random.
        if len(current_chunk) >= 2:
          a_end = rng.randint(1, len(current_chunk) - 1)
        
        tokens_a = []
        for j in range(a_end):
          tokens_a.extend(current_chunk[j])
        
        tokens_b = []
        # Whether sentence B is a random (unrelated) continuation.
        is_random_next = False
        if len(current_chunk) == 1 or rng.random() < 0.5:
          is_random_next = True
          target_b_length = target_seq_length - len(tokens_a)
          
          # Pick a random start sentence from a different, random document.
          # The draw may hit the current document, so retry up to 10 times.
          # After 10 failed tries the last draw is used anyway, so picking
          # the same document is still possible; this is ignored.
          
          for _ in range(10):
            random_document_index = rng.randint(0, len(all_documents) - 1)
            # Not the current document: accept this random_document_index.
            if random_document_index != document_index:
              break
          
          # The randomly chosen document.
          random_document = all_documents[random_document_index]
          # Random start sentence inside it.
          random_start = rng.randint(0, len(random_document) - 1)
          # Append tokens to tokens_b; stop once target_b_length is reached.
          for j in range(random_start, len(random_document)):
            tokens_b.extend(random_document[j])
            if len(tokens_b) >= target_b_length:
              break
          # The segments after a_end were replaced by sentences from another
          # document and were therefore not used. Rewind `i` so that the
          # next iterations put them into a new chunk instead of wasting
          # them.
          num_unused_segments = len(current_chunk) - a_end
          i -= num_unused_segments
        # Actual next sentence.
        else:
          is_random_next = False
          for j in range(a_end, len(current_chunk)):
            tokens_b.extend(current_chunk[j])
        
        # Trim the pair if it is too long. `truncate_seq_pair` is defined
        # elsewhere; presumably it shortens both lists in place to at most
        # max_num_tokens tokens in total - TODO confirm.
        truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
        
        tokens = []
        segment_ids = []
        
        # Sentence A: [CLS] + tokens_a, all with segment id 0.
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
          tokens.append(token)
          segment_ids.append(0)
        # End of A.
        tokens.append("[SEP]")
        segment_ids.append(0)
        
        # Sentence B: tokens_b with segment id 1.
        for token in tokens_b:
          tokens.append(token)
          segment_ids.append(1)
        # End of B.
        tokens.append("[SEP]")
        segment_ids.append(1)
        
        # Select the prediction positions and replace tokens accordingly.
        (tokens, masked_lm_positions,masked_lm_labels) = create_masked_lm_predictions(
            tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
        instance = TrainingInstance(
            tokens=tokens,
            segment_ids=segment_ids,
            is_random_next=is_random_next,
            masked_lm_positions=masked_lm_positions,
            masked_lm_labels=masked_lm_labels)
        instances.append(instance)
      # Start a fresh chunk.
      current_chunk = []
      current_length = 0
    i += 1
  
  return instances

create_masked_lm_predictions

create_masked_lm_predictions来随机的选择某些Token，把它变成[MASK]

def create_masked_lm_predictions(tokens, masked_lm_prob,
        max_predictions_per_seq, vocab_words, rng):
    """Choose random positions of `tokens` for masked-LM prediction.

    Args:
      tokens: token sequence; it is copied, not modified.
      masked_lm_prob: fraction of `len(tokens)` to select for prediction.
      max_predictions_per_seq: upper bound on the number of selections.
      vocab_words: sequence of vocabulary words used for random replacement.
      rng: random generator providing `shuffle`, `random` and `randint`.

    Returns:
      A tuple `(output_tokens, masked_lm_positions, masked_lm_labels)`:
      the token list after replacement, the selected positions in ascending
      order, and the original tokens at those positions.
    """
    # [CLS] and [SEP] are never candidates for prediction.
    special_tokens = ("[CLS]", "[SEP]")
    cand_indexes = [pos for (pos, tok) in enumerate(tokens)
                    if tok not in special_tokens]

    # After shuffling, taking candidates front to back is a random selection.
    rng.shuffle(cand_indexes)

    # Lightweight record pairing a position with its original token.
    masked_lm = collections.namedtuple("masked_lm", ["index", "label"])

    # At least one prediction, at most max_predictions_per_seq.
    wanted = max(1, int(round(len(tokens) * masked_lm_prob)))
    num_to_predict = min(max_predictions_per_seq, wanted)

    output_tokens = list(tokens)
    masked_lms = []
    covered_indexes = set()
    for index in cand_indexes:
        if len(masked_lms) >= num_to_predict:
            break
        # Defensive duplicate check; candidate positions are already unique.
        if index in covered_indexes:
            continue
        covered_indexes.add(index)

        if rng.random() < 0.8:
            # 80% of the time: replace with [MASK].
            replacement = "[MASK]"
        elif rng.random() < 0.5:
            # 10% of the time: keep the original token.
            replacement = tokens[index]
        else:
            # 10% of the time: replace with a random vocabulary word.
            replacement = vocab_words[rng.randint(0, len(vocab_words) - 1)]

        output_tokens[index] = replacement
        masked_lms.append(masked_lm(index=index, label=tokens[index]))

    # Restore sentence order.
    masked_lms.sort(key=lambda entry: entry.index)
    masked_lm_positions = [entry.index for entry in masked_lms]
    masked_lm_labels = [entry.label for entry in masked_lms]

    return (output_tokens, masked_lm_positions, masked_lm_labels)

loss

get_masked_lm_output

def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
            label_ids, label_weights):
    """Return the masked-LM loss, per-position loss and log-probabilities.

    Args:
      bert_config: config providing `hidden_size`, `hidden_act`,
        `initializer_range` and `vocab_size`.
      input_tensor: encoder output passed to `gather_indexes`.
      output_weights: projection matrix used (transposed) to produce vocab
        logits; passed in by the caller so it can be shared (per the
        original note, it is the input word-embedding table).
      positions: positions to predict, passed to `gather_indexes`.
      label_ids: target token ids; flattened to 1-D here.
      label_weights: per-target weights; flattened to 1-D here.

    Returns:
      `(loss, per_example_loss, log_probs)`.
    """
    # Only the outputs at the positions to be predicted are needed.
    masked_states = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # One extra non-linear layer before the output projection (per the
        # original note, these weights are only used during pre-training).
        with tf.variable_scope("transform"):
            activation_fn = modeling.get_activation(bert_config.hidden_act)
            kernel_init = modeling.create_initializer(
                bert_config.initializer_range)
            masked_states = tf.layers.dense(
                masked_states,
                units=bert_config.hidden_size,
                activation=activation_fn,
                kernel_initializer=kernel_init)
            masked_states = modeling.layer_norm(masked_states)

        # The projection weights come from the caller; only a bias is new.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.nn.bias_add(
            tf.matmul(masked_states, output_weights, transpose_b=True),
            output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # Flatten targets and weights to one entry per prediction slot.
        flat_label_ids = tf.reshape(label_ids, [-1])
        flat_label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(
            flat_label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

        # Padded prediction slots presumably carry weight 0 (e.g. weights
        # [1, 1, ..., 0, 0] when fewer than the maximum number of tokens
        # were selected), so they drop out of the weighted average; the
        # 1e-5 guards against division by zero.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        weighted_loss_sum = tf.reduce_sum(flat_label_weights * per_example_loss)
        weight_total = tf.reduce_sum(flat_label_weights) + 1e-5
        loss = weighted_loss_sum / weight_total

    return (loss, per_example_loss, log_probs)

get_next_sentence_output

def get_next_sentence_output(bert_config, input_tensor, labels):
    """Return the next-sentence-prediction loss and log-probabilities.

    A plain 2-way classifier on `input_tensor`. Per the original note,
    label 0 means "actual next sentence" and 1 means "random sentence",
    and these classifier weights are discarded for fine-tuning.

    Args:
      bert_config: config providing `hidden_size` and `initializer_range`.
      input_tensor: features multiplied with the [2, hidden_size] weights.
      labels: integer labels; flattened to 1-D here.

    Returns:
      `(loss, per_example_loss, log_probs)`.
    """
    with tf.variable_scope("cls/seq_relationship"):
        weight_init = modeling.create_initializer(bert_config.initializer_range)
        output_weights = tf.get_variable(
            "output_weights",
            shape=[2, bert_config.hidden_size],
            initializer=weight_init)
        output_bias = tf.get_variable(
            "output_bias", shape=[2], initializer=tf.zeros_initializer())

        logits = tf.nn.bias_add(
            tf.matmul(input_tensor, output_weights, transpose_b=True),
            output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # Cross entropy against one-hot targets, averaged over examples.
        flat_labels = tf.reshape(labels, [-1])
        one_hot_labels = tf.one_hot(flat_labels, depth=2, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, log_probs)

classification

convert_single_example

def convert_single_example(ex_index, example, label_list, max_seq_length,
                tokenizer):
    """Convert one `InputExample` into an `InputFeatures`.

    BERT's input convention:
      (a) For a sequence pair:
        tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
      (b) For a single sequence:
        tokens:   [CLS] the dog is hairy . [SEP]
        type_ids: 0     0   0   0  0     0 0

    The type ids (segment ids) tell the model which sequence a token
    belongs to. For classification, the vector of [CLS] can be viewed as a
    "sentence vector"; per the original note it is only meaningful after
    fine-tuning.

    Args:
      ex_index: index of the example; not used by this function.
      example: object with `text_a`, `text_b` (may be empty/None) and
        `label` attributes.
      label_list: sequence of all labels; a label's id is its position.
      max_seq_length: length of every returned feature list.
      tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

    Returns:
      An `InputFeatures` with `input_ids`, `input_mask`, `segment_ids`
      (each padded with 0 up to `max_seq_length`) and `label_id`.

    Raises:
      KeyError: if `example.label` is not in `label_list`.
    """
    # Rebuilt for every example; callers that care could precompute it.
    label_map = {label: i for (i, label) in enumerate(label_list)}

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # A pair needs 3 special tokens: [CLS], [SEP], [SEP]. The helper
        # is defined elsewhere; presumably it truncates both lists in
        # place - TODO confirm.
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    elif len(tokens_a) > max_seq_length - 2:
        # A single sequence needs only [CLS] and [SEP]; cut off the tail.
        tokens_a = tokens_a[0:(max_seq_length - 2)]

    # First sequence, including [CLS] and its closing [SEP]: segment 0.
    tokens = ["[CLS]"] + list(tokens_a) + ["[SEP]"]
    segment_ids = [0] * len(tokens)

    # Optional second sequence, including its closing [SEP]: segment 1.
    if tokens_b:
        tokens.extend(tokens_b)
        tokens.append("[SEP]")
        segment_ids.extend([1] * (len(tokens_b) + 1))

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # 1 marks a real token, 0 marks padding; the mask is later used to
    # keep attention away from the padded positions.
    input_mask = [1] * len(input_ids)

    # Zero-pad all three lists up to max_seq_length (no-op if already full).
    pad = [0] * (max_seq_length - len(input_ids))
    input_ids.extend(pad)
    input_mask.extend(pad)
    segment_ids.extend(pad)

    label_id = label_map[example.label]

    # NOTE(review): file_based_convert_examples_to_features reads
    # `feature.is_real_example`, which is not set here - confirm that
    # InputFeatures supplies a default for it.
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id)
    return feature

file_based_convert_examples_to_features

file_based_convert_examples_to_features函数遍历每一个example(InputExample类的对象)。然后使用convert_single_example函数把每个InputExample对象变成InputFeatures。InputFeatures就是一个存放特征的对象，它包括input_ids、input_mask、segment_ids和label_id，这4个属性除了label_id是一个int之外，其它都是int的列表，因此使用create_int_feature函数把它变成tf.train.Feature，而label_id需要构造一个只有一个元素的列表，最后构造tf.train.Example对象，然后写到TFRecord文件里。后面Estimator的input_fn会用到它。

# 首先通过file_based_convert_examples_to_features函数把输入的tsv文件变成TFRecord文件。便于Tensorflow处理。

def file_based_convert_examples_to_features(
        examples, label_list, max_seq_length, tokenizer, output_file):
    """Convert a set of `InputExample`s to a TFRecord file.

    Each example is converted with `convert_single_example` and written as
    one serialized `tf.train.Example` with the int64 features `input_ids`,
    `input_mask`, `segment_ids`, `label_ids` and `is_real_example`.

    Args:
      examples: sized sequence of examples (its `len` is used for logging).
      label_list: forwarded to `convert_single_example`.
      max_seq_length: forwarded to `convert_single_example`.
      tokenizer: forwarded to `convert_single_example`.
      output_file: path of the TFRecord file to write.
    """

    def create_int_feature(values):
        # Wrap an iterable of ints as an int64 tf.train.Feature.
        return tf.train.Feature(
            int64_list=tf.train.Int64List(value=list(values)))

    writer = tf.python_io.TFRecordWriter(output_file)
    # try/finally so the file is closed even if a conversion fails midway.
    try:
        for (ex_index, example) in enumerate(examples):
            if ex_index % 10000 == 0:
                tf.logging.info(
                    "Writing example %d of %d" % (ex_index, len(examples)))

            feature = convert_single_example(ex_index, example, label_list,
                                             max_seq_length, tokenizer)

            features = collections.OrderedDict()
            features["input_ids"] = create_int_feature(feature.input_ids)
            features["input_mask"] = create_int_feature(feature.input_mask)
            features["segment_ids"] = create_int_feature(feature.segment_ids)
            # Scalars are stored as one-element lists.
            features["label_ids"] = create_int_feature([feature.label_id])
            features["is_real_example"] = create_int_feature(
                [int(feature.is_real_example)])

            tf_example = tf.train.Example(
                features=tf.train.Features(feature=features))
            writer.write(tf_example.SerializeToString())
    finally:
        writer.close()

file_based_input_fn_builder

这个函数返回一个函数input_fn。这个input_fn函数首先从文件得到TFRecordDataset，然后根据是否训练来shuffle和重复读取。然后用apply函数对每一个TFRecord进行map_and_batch，调用_decode_record函数对record进行parsing。从而把TFRecord的一条Record变成tf.Example对象，这个对象包括了input_ids等4个用于训练的Tensor。

# 构造Estimator API的input_fn
def file_based_input_fn_builder(input_file, seq_length, is_training,
            drop_remainder):
    """Build an `input_fn` that reads examples from a TFRecord file.

    Args:
      input_file: TFRecord file passed to `tf.data.TFRecordDataset`.
      seq_length: fixed length of `input_ids`, `input_mask`, `segment_ids`.
      is_training: if true, the dataset is repeated and shuffled.
      drop_remainder: forwarded to `map_and_batch`.

    Returns:
      A function `input_fn(params)` that returns the batched dataset;
      `params["batch_size"]` sets the batch size.
    """
    # Three fixed-length int64 vectors plus one scalar label.
    name_to_features = {
        key: tf.FixedLenFeature([seq_length], tf.int64)
        for key in ("input_ids", "input_mask", "segment_ids")
    }
    name_to_features["label_ids"] = tf.FixedLenFeature([], tf.int64)

    def _decode_record(record, name_to_features):
        """Parse one serialized record and cast int64 tensors to int32."""
        parsed = tf.parse_single_example(record, name_to_features)

        # tf.Example only stores int64, but (per the original note) TPUs
        # only support int32, so every int64 tensor is cast down.
        for key in list(parsed.keys()):
            tensor = parsed[key]
            parsed[key] = tf.to_int32(tensor) if tensor.dtype == tf.int64 else tensor

        return parsed

    def input_fn(params):
        """Return the dataset for the Estimator."""
        batch_size = params["batch_size"]

        dataset = tf.data.TFRecordDataset(input_file)
        # Training reads the data repeatedly and shuffles it; evaluation
        # and prediction read it once, in order.
        if is_training:
            dataset = dataset.repeat().shuffle(buffer_size=100)

        parse_and_batch = tf.contrib.data.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=batch_size,
            drop_remainder=drop_remainder)
        return dataset.apply(parse_and_batch)

    return input_fn

model_fn_builder

用于构造Estimator使用的model_fn

def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
                num_train_steps, num_warmup_steps, use_tpu,
                use_one_hot_embeddings):
    """Return the `model_fn` closure used by the (TPU)Estimator.

    Args:
      bert_config: model configuration forwarded to `create_model`.
      num_labels: number of classes, forwarded to `create_model`.
      init_checkpoint: checkpoint to initialise variables from; skipped
        when falsy.
      learning_rate, num_train_steps, num_warmup_steps: forwarded to
        `optimization.create_optimizer`.
      use_tpu: forwarded to the optimizer; also selects how the checkpoint
        initialisation is wired in.
      use_one_hot_embeddings: forwarded to `create_model`.

    Returns:
      A function `model_fn(features, labels, mode, params)` returning a
      `TPUEstimatorSpec`.
    """

    # By Estimator convention `features` is the input and `labels` the
    # target, but here the labels travel inside `features["label_ids"]`;
    # the `labels` and `params` arguments are not used.
    def model_fn(features, labels, mode, params):
        """Build the graph and the spec for the given `mode`."""
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        # Build the classification model; this is the core of the graph.
        (total_loss, per_example_loss, logits, probabilities) = create_model(
            bert_config, is_training, input_ids, input_mask, segment_ids,
            label_ids, num_labels, use_one_hot_embeddings)

        tvars = tf.trainable_variables()

        # Must be defined on every path: all three specs below pass it.
        scaffold_fn = None
        # Restore parameters from the checkpoint.
        if init_checkpoint:
            (assignment_map, _) = modeling.get_assignment_map_from_checkpoint(
                tvars, init_checkpoint)
            if use_tpu:
                # On TPU the initialisation is deferred into the scaffold
                # function that TPUEstimator calls to build the Scaffold.
                def tpu_scaffold():
                    tf.train.init_from_checkpoint(
                        init_checkpoint, assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        output_spec = None
        # Training spec.
        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps,
                use_tpu)

            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold_fn=scaffold_fn)

        # Evaluation spec.
        elif mode == tf.estimator.ModeKeys.EVAL:
            def metric_fn(per_example_loss, label_ids, logits):
                # Accuracy of the arg-max class and the mean example loss.
                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                accuracy = tf.metrics.accuracy(label_ids, predictions)
                loss = tf.metrics.mean(per_example_loss)
                return {
                    "eval_accuracy": accuracy,
                    "eval_loss": loss,
                }

            eval_metrics = (metric_fn, [per_example_loss, label_ids, logits])
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=eval_metrics,
                scaffold_fn=scaffold_fn)

        # Prediction spec.
        else:
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                predictions=probabilities,
                scaffold_fn=scaffold_fn)
        return output_spec

    return model_fn

create_model

调用modeling.BertModel得到BERT模型，然后使用它的get_pooled_output方法得到[CLS]最后一层的输出，这是一个768(默认参数下)的向量，然后就是常规的接一个全连接层得到logits，然后softmax得到概率，之后就可以根据真实的分类标签计算loss。

def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                    labels, num_labels, use_one_hot_embeddings):
    """Build the BERT classification model and its loss.

    Args:
      bert_config: configuration passed to `modeling.BertModel`.
      is_training: passed to the model; also enables dropout on the pooled
        output.
      input_ids, input_mask, segment_ids: model inputs (`segment_ids` is
        passed as `token_type_ids`).
      labels: integer class labels, one-hot encoded for the loss.
      num_labels: number of classes.
      use_one_hot_embeddings: passed to `modeling.BertModel`.

    Returns:
      `(loss, per_example_loss, logits, probabilities)`.
    """
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # For classification only the pooled [CLS] output is needed. For
    # sequence labelling, model.get_sequence_output() could be used instead.
    # Presumably [batch_size, hidden_size], e.g. [8, 768] with the defaults
    # mentioned in the original note - TODO confirm.
    output_layer = model.get_pooled_output()

    # Size of the last dimension (768 by default per the original note).
    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # Drop 10% of the activations during training.
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        # Linear projection of the pooled vector to one logit per label.
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        # Cross entropy per example, then the mean over the batch.
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities)

squad

资料

参考

李理的博客

准备