High-dimensional vs. low-dimensional
Word embeddings are a low-dimensional technique: each word is mapped to a small, dense vector.
One-hot encoding, by contrast, is high-dimensional: if the vocabulary contains 20,000 tokens, each word requires a 20,000-dimensional vector.
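To make the contrast concrete, here is a minimal sketch (not part of the original code) comparing the two representations for a toy 20,000-token vocabulary; the token index 7 and the 64-dimensional embedding are arbitrary values chosen only for illustration.

import numpy as np

# One-hot: a sparse vector as long as the vocabulary (20,000 entries, all zero except one).
vocab_size = 20000
token_index = 7
one_hot = np.zeros(vocab_size)
one_hot[token_index] = 1.0

# Embedding: a dense, low-dimensional vector looked up from a (trainable) table.
embedding_dim = 64
embedding_table = np.random.uniform(-0.05, 0.05, size=(vocab_size, embedding_dim))
dense_vector = embedding_table[token_index]

print(one_hot.shape)       # (20000,)
print(dense_vector.shape)  # (64,)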
from keras.layers import Embedding
# The Embedding layer takes at least two arguments:
# the number of possible tokens (here 1,000, i.e. the maximum word index + 1)
# and the embedding dimensionality (here 64).
embedding_layer = Embedding(1000, 64)
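As a quick check (an illustrative sketch, not from the original notebook), calling the layer on a small batch of integer indices shows that it returns one 64-dimensional vector per index; the dummy input below is made up for demonstration.

import numpy as np
dummy_input = np.array([[3, 17, 256, 0]])    # shape (1, 4): one sequence of 4 word indices
dummy_output = embedding_layer(dummy_input)  # each index is mapped to a 64-dim vector
print(dummy_output.shape)                    # (1, 4, 64)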
from keras.datasets import imdb
from keras import preprocessing
# Number of words to use as features
max_features = 10000
# Load the data as lists of integers.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((25000,), (25000,), (25000,), (25000,))
print(X_train[0])
print(y_train[0])
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
1
# Review length and the first 10 word indices
len(X_train[0]), X_train[0][0:10]  # the first review consists of 218 words
(218, [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65])
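To sanity-check that these integers really are word indices, the usual Keras recipe below (a sketch, not in the original text) decodes the first review back into words with imdb.get_word_index(); indices 0, 1 and 2 are reserved for padding, start-of-sequence and unknown tokens, hence the offset of 3.

word_index = imdb.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}
# Shift each index by 3 to skip the reserved tokens; unknown indices become '?'.
decoded_review = ' '.join(reverse_word_index.get(i - 3, '?') for i in X_train[0])
print(decoded_review[:100])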
# Convert the lists into a 2D integer tensor of shape (samples, maxlen).
maxlen = 50
X_train_n = preprocessing.sequence.pad_sequences(X_train, maxlen=maxlen)
X_test_n = preprocessing.sequence.pad_sequences(X_test, maxlen=maxlen)
print("Before: ", X_train.shape, X_test.shape)
print("After:  ", X_train_n.shape, X_test_n.shape)
Before:  (25000,) (25000,)
After:   (25000, 50) (25000, 50)
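For reference, the small sketch below (with made-up toy sequences, not from the original notebook) shows the default behaviour of pad_sequences: both padding and truncating are 'pre', so sequences longer than maxlen keep their last maxlen entries and shorter ones are left-padded with zeros.

toy = [[1, 2, 3], [4, 5, 6, 7, 8, 9, 10]]
print(preprocessing.sequence.pad_sequences(toy, maxlen=5))
# [[ 0  0  1  2  3]
#  [ 6  7  8  9 10]]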
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
model = Sequential()
# Specify input_length for the Embedding layer so that the embedded
# input can later be unrolled by the Flatten layer.
# The input to the Embedding layer is a 2D tensor of shape (samples, maxlen).
model.add(Embedding(10000, 8, input_length=maxlen))
# The output of the Embedding layer has shape (samples, maxlen, 8).
# Flatten the 3D embedding tensor into a 2D tensor of shape (samples, maxlen * 8).
model.add(Flatten())
# Add the classifier on top.
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
loss='binary_crossentropy',
metrics=['acc'])
model.summary()
history = model.fit(X_train_n, y_train,
epochs=10,
batch_size=32,
validation_split=0.2)
Model: "sequential_2" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_3 (Embedding) (None, 50, 8) 80000 _________________________________________________________________ flatten (Flatten) (None, 400) 0 _________________________________________________________________ dense (Dense) (None, 1) 401 ================================================================= Total params: 80,401 Trainable params: 80,401 Non-trainable params: 0 _________________________________________________________________ Epoch 1/10 625/625 [==============================] - 8s 10ms/step - loss: 0.6797 - acc: 0.5842 - val_loss: 0.5534 - val_acc: 0.7622 Epoch 2/10 625/625 [==============================] - 2s 3ms/step - loss: 0.4908 - acc: 0.7907 - val_loss: 0.4336 - val_acc: 0.7960 Epoch 3/10 625/625 [==============================] - 2s 3ms/step - loss: 0.3752 - acc: 0.8404 - val_loss: 0.4054 - val_acc: 0.8094 Epoch 4/10 625/625 [==============================] - 2s 3ms/step - loss: 0.3268 - acc: 0.8595 - val_loss: 0.3994 - val_acc: 0.8122 Epoch 5/10 625/625 [==============================] - 2s 3ms/step - loss: 0.2982 - acc: 0.8766 - val_loss: 0.3994 - val_acc: 0.8146 Epoch 6/10 625/625 [==============================] - 2s 3ms/step - loss: 0.2614 - acc: 0.8923 - val_loss: 0.4052 - val_acc: 0.8150 Epoch 7/10 625/625 [==============================] - 2s 3ms/step - loss: 0.2474 - acc: 0.9027 - val_loss: 0.4118 - val_acc: 0.8194 Epoch 8/10 625/625 [==============================] - 2s 3ms/step - loss: 0.2257 - acc: 0.9141 - val_loss: 0.4213 - val_acc: 0.8122 Epoch 9/10 625/625 [==============================] - 2s 3ms/step - loss: 0.2101 - acc: 0.9212 - val_loss: 0.4294 - val_acc: 0.8148 Epoch 10/10 625/625 [==============================] - 2s 3ms/step - loss: 0.1940 - acc: 0.9273 - val_loss: 0.4405 - val_acc: 0.8106