attenion_block.py
import math
import tensorflow as tf
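
# NOTE: this module is written against the TensorFlow 1.x API
# (tf.layers.dense, tf.variable_scope, tf.AUTO_REUSE). Under TensorFlow 2.x it
# would need to run through tf.compat.v1 with v1 behaviour enabled.
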
def split_heads(x, hidden_size, num_heads):
    """Split x into different heads, and transpose the resulting value.

    The tensor is transposed to ensure the inner dimensions hold the correct
    values during the matrix multiplication.

    Args:
        x: A tensor with shape [batch_size, length, hidden_size]
        hidden_size: Int, the unit size of the hidden layer
        num_heads: Int, the number of heads to split into

    Returns:
        A tensor with shape [batch_size, num_heads, length, hidden_size/num_heads]
    """
    with tf.name_scope("split_heads"):
        batch_size = tf.shape(x)[0]
        length = tf.shape(x)[1]
        # Calculate depth of the last dimension after it has been split.
        depth = hidden_size // num_heads
        # Split the last dimension into (num_heads, depth).
        x = tf.reshape(x, [batch_size, length, num_heads, depth])
        # Transpose to [batch_size, num_heads, length, depth].
        return tf.transpose(x, [0, 2, 1, 3])
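
# Shape example (illustrative numbers, not fixed by this module): with
# hidden_size=512 and num_heads=8, depth = 512 // 8 = 64, so an input of shape
# [batch_size, length, 512] becomes [batch_size, length, 8, 64] after the
# reshape and [batch_size, 8, length, 64] after the transpose.
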
def combine_heads(x, hidden_size):
    """Combine a tensor that has been split into heads.

    Args:
        x: A tensor with shape [batch_size, num_heads, length, hidden_size/num_heads]
        hidden_size: Int, the unit size of the hidden layer

    Returns:
        A tensor with shape [batch_size, length, hidden_size]
    """
    with tf.name_scope("combine_heads"):
        batch_size = tf.shape(x)[0]
        length = tf.shape(x)[2]
        # Transpose back to [batch_size, length, num_heads, depth].
        x = tf.transpose(x, [0, 2, 1, 3])
        # Merge the heads into a single hidden dimension.
        return tf.reshape(x, [batch_size, length, hidden_size])
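
# combine_heads undoes split_heads: continuing the example above,
# [batch_size, 8, length, 64] is transposed back to [batch_size, length, 8, 64]
# and reshaped to [batch_size, length, 512].
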
def multi_head_attention(x, y, hidden_size, num_heads, attention_dropout=0.3,
                         scope='multi_head_attention', training=False):
    """Apply multi-head attention to x (queries) and y (keys/values).

    Args:
        x: A tensor with shape [batch_size, length_x, hidden_size]
        y: A tensor with shape [batch_size, length_y, hidden_size]
        hidden_size: Int, the unit size of the hidden layer
        num_heads: Int, the number of heads
        attention_dropout: Float, the probability of dropping attention weights
            during training
        scope: (optional) the variable scope name for this block
        training: Bool, True for training, False for inference

    Returns:
        Attention layer output with shape [batch_size, length_x, hidden_size]
    """
    if hidden_size % num_heads != 0:
        raise ValueError("Hidden size must be evenly divisible by the number of "
                         "heads.")
    with tf.name_scope(name='multi_head_attention'):
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            # Linearly project the query (q), key (k) and value (v) using
            # different learned projections. This is in preparation for splitting
            # them into multiple heads. Multi-head attention uses multiple
            # queries, keys, and values rather than regular attention (which
            # uses a single q, k, v).
            q = tf.layers.dense(x, hidden_size, use_bias=False, name="q")
            k = tf.layers.dense(y, hidden_size, use_bias=False, name="k")
            v = tf.layers.dense(y, hidden_size, use_bias=False, name="v")
            # Split q, k, v into heads.
            q = split_heads(q, hidden_size, num_heads)
            k = split_heads(k, hidden_size, num_heads)
            v = split_heads(v, hidden_size, num_heads)
            # Scale q to prevent the dot product between q and k from growing
            # too large.
            depth = hidden_size // num_heads
            q *= depth ** -0.5
            # Calculate dot-product attention: softmax(q k^T / sqrt(depth)) v.
            logits = tf.matmul(q, k, transpose_b=True)
            weights = tf.nn.softmax(logits, name="attention_weights")
            if training:
                # tf.nn.dropout in TF1 takes keep_prob, hence 1 - dropout rate.
                weights = tf.nn.dropout(weights, 1.0 - attention_dropout)
            attention_output = tf.matmul(weights, v)
            # Recombine heads --> [batch_size, length_x, hidden_size]
            attention_output = combine_heads(attention_output, hidden_size)
            # Run the combined outputs through another linear projection layer.
            attention_output = tf.layers.dense(attention_output, hidden_size,
                                               use_bias=False, name="output_transform")
            return attention_output
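
# Usage note (illustrative, not part of the original module): self-attention is
# the common case where the query and the key/value inputs are the same tensor,
# i.e. multi_head_attention(x, x, hidden_size, num_heads); cross-attention
# (e.g. a decoder attending to encoder outputs) passes the decoder states as x
# and the encoder outputs as y.
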
def get_position_encoding(length, hidden_size, min_timescale=1.0,
                          max_timescale=1.0e4):
    """Return positional encoding.

    Calculates the position encoding as a mix of sine and cosine functions with
    geometrically increasing wavelengths, as defined in "Attention Is All You
    Need", section 3.5.

    Args:
        length: Sequence length.
        hidden_size: Size of the hidden states (the model dimension).
        min_timescale: Minimum scale that will be applied at each position.
        max_timescale: Maximum scale that will be applied at each position.

    Returns:
        A tensor with shape [length, hidden_size]
    """
    # We compute the positional encoding in float32 even if the model uses
    # float16, as many of the ops used, like log and exp, are numerically
    # unstable in float16.
    position = tf.cast(tf.range(length), tf.float32)
    num_timescales = hidden_size // 2
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        (tf.cast(num_timescales, tf.float32) - 1))
    inv_timescales = min_timescale * tf.exp(
        tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
    # Sines and cosines are concatenated along the channel axis (first half sin,
    # second half cos) rather than interleaved as in the paper; the two layouts
    # differ only by a fixed permutation of the channels.
    signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
    return signal
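

if __name__ == "__main__":
    # Minimal smoke-test sketch, assuming a TensorFlow 1.x (session/placeholder)
    # environment; the shapes and hyperparameters below are illustrative only.
    import numpy as np

    batch_size, length, hidden_size, num_heads = 2, 10, 64, 4
    inputs = tf.placeholder(tf.float32, shape=[None, None, hidden_size])
    # Add the sinusoidal position signal, then run self-attention
    # (query and key/value are the same tensor here).
    x = inputs + get_position_encoding(length, hidden_size)
    attended = multi_head_attention(x, x, hidden_size, num_heads, training=False)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        out = sess.run(attended, feed_dict={
            inputs: np.random.randn(batch_size, length, hidden_size).astype(np.float32)})
        print("attention output shape:", out.shape)  # expected: (2, 10, 64)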