Import Packages¶

In [1]:
import os
import re
import string
import numpy as np
import pandas as pd
import nltk
import tensorflow as tf

from nltk.corpus import stopwords

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam, Adamax, SGD
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout, SpatialDropout1D
from tensorflow.keras import regularizers

import random as rn

# Seed every RNG source for reproducibility
os.environ['PYTHONHASHSEED'] = '0'
rn.seed(123)
np.random.seed(123)
tf.random.set_seed(123)
In [3]:
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/iwanxone/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Out[3]:
True

Load Data¶

In [4]:
path = r"./Dataset/indosum datasets"
df = pd.read_csv(path, header=None, delimiter='\t')
df.columns = ['category', 'source', 'article']
In [5]:
df.head()
Out[5]:
category source article
0 teknologi kumparan Uber pada hari Jumat mengatakan akan menguak d...
1 teknologi dailysocial.id Menyusul jejak NES Classic Edition , SNES Clas...
2 teknologi dailysocial.id MDI Ventures , perusahaan modal ventura yang d...
3 teknologi kumparan Mazda masih menutup rapat informasi soal sport...
4 teknologi dailysocial.id Sampai di akhir tahun 2017 ini , frasa “ print...
In [5]:
df.isna().sum()
Out[5]:
category    0
source      0
article     0
dtype: int64
In [6]:
df['category'].value_counts()
Out[6]:
teknologi      2000
tajuk utama    2000
showbiz        2000
olahraga       2000
hiburan        2000
Name: category, dtype: int64
In [7]:
df.groupby(df['category']).count()
Out[7]:
source article
category
hiburan 2000 2000
olahraga 2000 2000
showbiz 2000 2000
tajuk utama 2000 2000
teknologi 2000 2000
In [8]:
import seaborn as sns
import matplotlib.pyplot as plt

import warnings as wrn
wrn.filterwarnings('ignore', category = DeprecationWarning) 
wrn.filterwarnings('ignore', category = FutureWarning) 
wrn.filterwarnings('ignore', category = UserWarning)
wrn.filterwarnings('ignore', category = RuntimeWarning)
In [9]:
plt.title('Category Count')
sns.countplot(x='category', data=df)
Out[9]:
<AxesSubplot:title={'center':'Category Count'}, xlabel='category', ylabel='count'>

Pre-Processing Data¶

In [6]:
def filtered(text):
    """
    Lowercase the text, remove text in square brackets, remove links,
    remove HTML tags, punctuation, newlines, and words containing digits.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
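
As a quick sanity check, the cleaner can be applied to a made-up sentence (the example string below is illustrative, not taken from the dataset):

filtered("Baca selengkapnya di https://example.com [iklan] pukul 19.30 WIB!")
# the URL, the bracketed text, all punctuation, and the number token '19.30'
# are stripped; only lowercase words remain (leftover whitespace is harmless,
# since the tokenizer later splits on word characters anyway)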
In [7]:
def stopwordsRemoval(text):
    """
    Tokenize the corpus text and drop Indonesian and English stopwords,
    returning only the informative words of each document.
    """
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    tokens = tokenizer.tokenize(text)
    listStopwords = set(stopwords.words('indonesian') + stopwords.words('english'))
    filtered = [s for s in tokens if s not in listStopwords]
    textFiltered = ' '.join(filtered)
    return textFiltered
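
A similar spot check for the stopword filter, again with an illustrative sentence:

stopwordsRemoval("presiden akan dan yang meninjau the stadium")
# Indonesian stopwords ('akan', 'dan', 'yang') and English ones ('the')
# should be dropped, leaving roughly 'presiden meninjau stadium'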
In [12]:
# factory = StemmerFactory()
# stemmer = factory.create_stemmer()
In [8]:
df['text_clean'] = df['article'].apply(lambda x: filtered(x))
In [9]:
df['stopword_removal'] = df['text_clean'].apply(lambda x: stopwordsRemoval(x))
In [15]:
# df['stemming'] = df['stopword_removal'].apply(lambda x: stemmer.stem(x))
In [10]:
df['Number_of_words'] = df['stopword_removal'].apply(lambda x:len(str(x).split()))
In [11]:
df.loc[0:5, ['stopword_removal','Number_of_words']]
Out[11]:
stopword_removal Number_of_words
0 uber jumat menguak data perjalanan paris publi... 142
1 menyusul jejak nes classic edition snes classi... 166
2 mdi ventures perusahaan modal ventura didukung... 291
3 mazda menutup rapat informasi sportscar anyar ... 131
4 frasa printer mencetak video terdengar mustahi... 140
5 dxo one aksesori mungil berfungsi mengubah iph... 156
In [12]:
df['Number_of_words'].describe()
Out[12]:
count    10000.000000
mean       170.040400
std         68.460311
min         20.000000
25%        124.000000
50%        158.000000
75%        205.000000
max        662.000000
Name: Number_of_words, dtype: float64
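
The 75th percentile is 205 words, which motivates the 200-token cutoff used below. A quick check of how many documents fit entirely within that window:

# Fraction of documents with at most 200 tokens after stopword removal
(df['Number_of_words'] <= 200).mean()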

Splitting the Data into Training & Testing Sets¶

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df['stopword_removal'],
                                                    df['category'],
                                                    test_size=0.1,
                                                    random_state=25)
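
The split above is purely random, so the per-class test counts drift slightly away from 100 each (see the support column in the classification reports below). If exactly balanced classes are wanted, a stratified variant would be:

# Optional: keep the 5 categories equally represented in both splits
X_train, X_test, y_train, y_test = train_test_split(df['stopword_removal'],
                                                    df['category'],
                                                    test_size=0.1,
                                                    random_state=25,
                                                    stratify=df['category'])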
In [14]:
X_train
Out[14]:
9189    jakarta cnn indonesia suzzanna ikon film horor...
6354    jakarta cnn indonesia kapten persib bandung at...
2217    jakarta cnn indonesia tujuh siswa setingkat sm...
5663    jakarta cnn indonesia musik punk mati grup leg...
5102    korea selatan bts meraih sukses industri musik...
                              ...                        
1175    true money indonesia salah perusahaan uang ele...
8447    hidrogen peroksida alias cairan asam lemah ber...
2934    jakarta cnn indonesia polda metro jaya berkoor...
6618    juaranet laga digelar seri indonesian basketba...
8510    coca cola menegak segelas coca cola dingin sen...
Name: stopword_removal, Length: 9000, dtype: object
In [15]:
X_test.index[:25]
Out[15]:
Int64Index([3555, 4078, 8445, 5939, 5583, 1656, 5550, 1736, 6297, 6364, 6341,
            2802, 8579, 2351,  877, 5844,  129, 7234, 5704, 6047,   79, 2969,
            5948, 8906, 9914],
           dtype='int64')

Representing Text as Integers¶

In [16]:
MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 200
tokenizer = Tokenizer(oov_token='UNK', lower=True)
tokenizer.fit_on_texts(X_train)
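
After fitting, the tokenizer's vocabulary size (plus one, since index 0 is reserved for padding) is what the embedding layers below size themselves to:

vocab_size = len(tokenizer.word_index) + 1  # the same value as MAX_NB_WORDS below
print(vocab_size)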

Converting Text to Sequences & Padding¶

In [19]:
train_seq = tokenizer.texts_to_sequences(X_train)
train_pad = pad_sequences(train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

test_seq = tokenizer.texts_to_sequences(X_test)
test_pad = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
In [20]:
train_pad
Out[20]:
array([[ 4906,  2713,  7172, ...,  1240,  1078,  1078],
       [    3,    12,     2, ...,     0,     0,     0],
       [    3,    12,     2, ...,     0,     0,     0],
       ...,
       [    3,    12,     2, ...,     0,     0,     0],
       [  481,    33,   188, ...,     0,     0,     0],
       [ 7671,  6887, 38458, ...,     0,     0,     0]], dtype=int32)

Converting Labels to Integers (LabelEncoder)¶

In [21]:
encoder = LabelEncoder()
encoder.fit(y_train)
train_encod = encoder.transform(y_train)
test_encod = encoder.transform(y_test)
In [22]:
num_classes = np.max(train_encod) + 1
train_encod = to_categorical(train_encod, num_classes)
test_encod = to_categorical(test_encod, num_classes)
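
To see which column of the one-hot targets corresponds to which category (LabelEncoder assigns indices in alphabetical order):

# -> {'hiburan': 0, 'olahraga': 1, 'showbiz': 2, 'tajuk utama': 3, 'teknologi': 4}
print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))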
In [23]:
print(train_encod)
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 ...
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]]

Data Shapes¶

In [24]:
print('X_train', train_pad.shape)
print('X_test', test_pad.shape)
print('y_train', train_encod.shape)
print('y_test', test_encod.shape)
X_train (9000, 200)
X_test (1000, 200)
y_train (9000, 5)
y_test (1000, 5)

LSTM Model Training Scenarios¶

Model 1¶

In [25]:
MODEL2 = {'units':128, 'dropout':0.5, 'Lr':0.01, 'batch-size':128, 'epoch':50}
In [28]:
MAX_NB_WORDS = len(tokenizer.word_index)+1

model2 = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=train_pad.shape[1]),
    SpatialDropout1D(MODEL2['dropout']),
    LSTM(MODEL2['units'], dropout=MODEL2['dropout'], recurrent_dropout=MODEL2['dropout'], activation='tanh'),
    Dropout(MODEL2['dropout']),
    Dense(num_classes, activation='softmax')
])

model2.summary()
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding_2 (Embedding)     (None, 200, 200)          14488600  
                                                                 
 spatial_dropout1d_2 (Spatia  (None, 200, 200)         0         
 lDropout1D)                                                     
                                                                 
 lstm_2 (LSTM)               (None, 128)               168448    
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 5)                 645       
                                                                 
=================================================================
Total params: 14,657,693
Trainable params: 14,657,693
Non-trainable params: 0
_________________________________________________________________
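
The parameter counts in the summary can be verified by hand: the embedding layer holds vocab_size × EMBEDDING_DIM weights, the LSTM has four gates, each with input, recurrent, and bias weights, and the dense head is a standard affine map:

embedding = 72443 * 200            # 14,488,600  (vocab size x EMBEDDING_DIM)
lstm = 4 * (200 + 128 + 1) * 128   #    168,448  (4 gates: input + recurrent + bias)
dense = (128 + 1) * 5              #        645  (weights + bias per class)
print(embedding + lstm + dense)    # 14,657,693, matching the summary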
In [29]:
model2.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=MODEL2['Lr']),
              metrics=['accuracy']
              )
In [30]:
esCallback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history2 = model2.fit(train_pad, train_encod,
                    batch_size=MODEL2['batch-size'],
                    epochs=MODEL2['epoch'],
                    validation_split=0.1,
                    verbose=1,
                    callbacks=[esCallback]
                    )
history2
Epoch 1/50
64/64 [==============================] - 266s 4s/step - loss: 1.4531 - accuracy: 0.3305 - val_loss: 1.3656 - val_accuracy: 0.3711
Epoch 2/50
64/64 [==============================] - 313s 5s/step - loss: 1.4909 - accuracy: 0.3067 - val_loss: 1.4505 - val_accuracy: 0.3422
Epoch 3/50
64/64 [==============================] - 293s 5s/step - loss: 1.3740 - accuracy: 0.3693 - val_loss: 1.2777 - val_accuracy: 0.4267
Epoch 4/50
64/64 [==============================] - 257s 4s/step - loss: 1.2727 - accuracy: 0.4252 - val_loss: 1.2258 - val_accuracy: 0.4689
Epoch 5/50
64/64 [==============================] - 246s 4s/step - loss: 1.1278 - accuracy: 0.5100 - val_loss: 0.9642 - val_accuracy: 0.5800
Epoch 6/50
64/64 [==============================] - 268s 4s/step - loss: 0.9500 - accuracy: 0.5977 - val_loss: 0.8386 - val_accuracy: 0.6178
Epoch 7/50
64/64 [==============================] - 247s 4s/step - loss: 0.8544 - accuracy: 0.6326 - val_loss: 0.8439 - val_accuracy: 0.6667
Epoch 8/50
64/64 [==============================] - 231s 4s/step - loss: 0.7210 - accuracy: 0.6864 - val_loss: 0.7591 - val_accuracy: 0.6933
Epoch 9/50
64/64 [==============================] - 244s 4s/step - loss: 0.5938 - accuracy: 0.7498 - val_loss: 0.7377 - val_accuracy: 0.7022
Epoch 10/50
64/64 [==============================] - 201s 3s/step - loss: 0.4907 - accuracy: 0.7986 - val_loss: 0.6381 - val_accuracy: 0.7656
Epoch 11/50
64/64 [==============================] - 241s 4s/step - loss: 0.4244 - accuracy: 0.8412 - val_loss: 0.7605 - val_accuracy: 0.7389
Epoch 12/50
64/64 [==============================] - 243s 4s/step - loss: 0.3431 - accuracy: 0.8774 - val_loss: 0.5298 - val_accuracy: 0.8389
Epoch 13/50
64/64 [==============================] - 216s 3s/step - loss: 0.2638 - accuracy: 0.9102 - val_loss: 0.5396 - val_accuracy: 0.8500
Epoch 14/50
64/64 [==============================] - 261s 4s/step - loss: 0.2064 - accuracy: 0.9386 - val_loss: 0.5167 - val_accuracy: 0.8644
Epoch 15/50
64/64 [==============================] - 249s 4s/step - loss: 0.1807 - accuracy: 0.9500 - val_loss: 0.6138 - val_accuracy: 0.8367
Epoch 16/50
64/64 [==============================] - 349s 5s/step - loss: 0.1636 - accuracy: 0.9568 - val_loss: 0.5345 - val_accuracy: 0.8578
Epoch 17/50
64/64 [==============================] - 318s 5s/step - loss: 0.1423 - accuracy: 0.9625 - val_loss: 0.6736 - val_accuracy: 0.8611
Out[30]:
<keras.callbacks.History at 0x7f609188f910>
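
Training stops three epochs after the best val_loss (epoch 14 here), so the weights left in memory are those of epoch 17, not the best epoch. If the best checkpoint is wanted instead, EarlyStopping can be asked to roll back, e.g.:

# Keeps the weights from the epoch with the lowest val_loss
esCallback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=3,
                                              restore_best_weights=True)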
In [31]:
score2 = model2.evaluate(test_pad, test_encod,
                       batch_size=MODEL2['batch-size'], verbose=2)
print('Test loss:', score2[0])
print('Test accuracy:', score2[1])
8/8 - 16s - loss: 0.6910 - accuracy: 0.8660 - 16s/epoch - 2s/step
Test loss: 0.6910145878791809
Test accuracy: 0.8659999966621399
In [30]:
import matplotlib.pyplot as plt
acc = history2.history['accuracy']
val_acc = history2.history['val_accuracy']
loss = history2.history['loss']
val_loss = history2.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()

plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
In [32]:
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
In [31]:
text_labels = encoder.classes_
y_pred = model2.predict(test_pad)
y_pred_clases = [text_labels[np.argmax(elemen)] for elemen in y_pred]
Y_test = text_labels[np.argmax(test_encod, axis=1)]
print("Classification Report: \n", classification_report(y_test.values, y_pred_clases))
Classification Report: 
               precision    recall  f1-score   support

     hiburan       0.80      0.77      0.79       202
    olahraga       0.98      0.99      0.99       191
     showbiz       0.87      0.87      0.87       193
 tajuk utama       0.84      0.83      0.83       187
   teknologi       0.92      0.96      0.94       227

    accuracy                           0.88      1000
   macro avg       0.88      0.88      0.88      1000
weighted avg       0.88      0.88      0.88      1000

In [33]:
met = confusion_matrix(y_test.values, y_pred_clases)
plot_confusion_matrix(conf_mat=met, figsize=(6, 6), class_names=text_labels)
Out[33]:
(<Figure size 432x432 with 1 Axes>,
 <matplotlib.axes._subplots.AxesSubplot at 0x214b350a7b8>)

Model 2¶

In [26]:
MODEL4 = {'units':128, 'dropout':0.5, 'Lr':0.001, 'batch-size':128, 'epoch':50}
MODEL4
Out[26]:
{'units': 128, 'dropout': 0.5, 'Lr': 0.001, 'batch-size': 128, 'epoch': 50}
In [27]:
MAX_NB_WORDS = len(tokenizer.word_index)+1

model4 = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=train_pad.shape[1]),
    SpatialDropout1D(MODEL4['dropout']),
    LSTM(MODEL4['units'], dropout=MODEL4['dropout'], recurrent_dropout=MODEL4['dropout'], activation='tanh'),
    Dropout(MODEL4['dropout']),
    Dense(num_classes, activation='softmax')
])

model4.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, 200, 200)          14488600  
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 200, 200)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               168448    
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 5)                 645       
=================================================================
Total params: 14,657,693
Trainable params: 14,657,693
Non-trainable params: 0
_________________________________________________________________
In [28]:
model4.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=MODEL4['Lr']),
              metrics=['accuracy']
              )
In [29]:
esCallback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history4 = model4.fit(train_pad, train_encod,
                    batch_size=MODEL4['batch-size'],
                    epochs=MODEL4['epoch'],
                    validation_split=0.1,
                    verbose=1,
                    callbacks=[esCallback]
                    )
history4
Train on 8100 samples, validate on 900 samples
Epoch 1/50
8100/8100 [==============================] - 221s 27ms/sample - loss: 1.5289 - accuracy: 0.2894 - val_loss: 1.2912 - val_accuracy: 0.3622
Epoch 2/50
8100/8100 [==============================] - 175s 22ms/sample - loss: 1.3522 - accuracy: 0.3723 - val_loss: 1.2809 - val_accuracy: 0.3722
Epoch 3/50
8100/8100 [==============================] - 186s 23ms/sample - loss: 1.3059 - accuracy: 0.3868 - val_loss: 1.2999 - val_accuracy: 0.3933
Epoch 4/50
8100/8100 [==============================] - 217s 27ms/sample - loss: 1.3473 - accuracy: 0.3936 - val_loss: 1.2758 - val_accuracy: 0.4189
Epoch 5/50
8100/8100 [==============================] - 230s 28ms/sample - loss: 1.2571 - accuracy: 0.4374 - val_loss: 1.1798 - val_accuracy: 0.4544
Epoch 6/50
8100/8100 [==============================] - 218s 27ms/sample - loss: 1.1938 - accuracy: 0.4779 - val_loss: 1.0963 - val_accuracy: 0.4922
Epoch 7/50
8100/8100 [==============================] - 202s 25ms/sample - loss: 1.0084 - accuracy: 0.5653 - val_loss: 0.8115 - val_accuracy: 0.6322
Epoch 8/50
8100/8100 [==============================] - 203s 25ms/sample - loss: 0.9038 - accuracy: 0.6181 - val_loss: 0.7902 - val_accuracy: 0.6444
Epoch 9/50
8100/8100 [==============================] - 207s 26ms/sample - loss: 0.8581 - accuracy: 0.6364 - val_loss: 0.7853 - val_accuracy: 0.6511
Epoch 10/50
8100/8100 [==============================] - 211s 26ms/sample - loss: 0.8249 - accuracy: 0.6532 - val_loss: 0.7714 - val_accuracy: 0.6467
Epoch 11/50
8100/8100 [==============================] - 196s 24ms/sample - loss: 0.7869 - accuracy: 0.6623 - val_loss: 0.7675 - val_accuracy: 0.6511
Epoch 12/50
8100/8100 [==============================] - 194s 24ms/sample - loss: 0.7846 - accuracy: 0.6602 - val_loss: 0.7538 - val_accuracy: 0.6533
Epoch 13/50
8100/8100 [==============================] - 194s 24ms/sample - loss: 0.7492 - accuracy: 0.6748 - val_loss: 0.7285 - val_accuracy: 0.6667
Epoch 14/50
8100/8100 [==============================] - 195s 24ms/sample - loss: 0.7190 - accuracy: 0.6917 - val_loss: 0.7093 - val_accuracy: 0.6833
Epoch 15/50
8100/8100 [==============================] - 194s 24ms/sample - loss: 0.6722 - accuracy: 0.7312 - val_loss: 0.5539 - val_accuracy: 0.8056
Epoch 16/50
8100/8100 [==============================] - 195s 24ms/sample - loss: 0.6188 - accuracy: 0.7607 - val_loss: 0.5385 - val_accuracy: 0.8300
Epoch 17/50
8100/8100 [==============================] - 198s 24ms/sample - loss: 0.5956 - accuracy: 0.7780 - val_loss: 0.5009 - val_accuracy: 0.8300
Epoch 18/50
8100/8100 [==============================] - 195s 24ms/sample - loss: 0.5483 - accuracy: 0.8046 - val_loss: 0.4034 - val_accuracy: 0.8911
Epoch 19/50
8100/8100 [==============================] - 199s 25ms/sample - loss: 0.4991 - accuracy: 0.8460 - val_loss: 0.4170 - val_accuracy: 0.8944
Epoch 20/50
8100/8100 [==============================] - 214s 26ms/sample - loss: 0.4419 - accuracy: 0.8738 - val_loss: 0.3828 - val_accuracy: 0.9011
Epoch 21/50
8100/8100 [==============================] - 212s 26ms/sample - loss: 0.4289 - accuracy: 0.8784 - val_loss: 0.3702 - val_accuracy: 0.9044
Epoch 22/50
8100/8100 [==============================] - 188s 23ms/sample - loss: 0.3966 - accuracy: 0.8879 - val_loss: 0.3891 - val_accuracy: 0.9022
Epoch 23/50
8100/8100 [==============================] - 191s 24ms/sample - loss: 0.3494 - accuracy: 0.9056 - val_loss: 0.3387 - val_accuracy: 0.9100
Epoch 24/50
8100/8100 [==============================] - 192s 24ms/sample - loss: 0.3348 - accuracy: 0.9125 - val_loss: 0.3431 - val_accuracy: 0.9067
Epoch 25/50
8100/8100 [==============================] - 192s 24ms/sample - loss: 0.3136 - accuracy: 0.9173 - val_loss: 0.3425 - val_accuracy: 0.9133
Epoch 26/50
8100/8100 [==============================] - 189s 23ms/sample - loss: 0.2986 - accuracy: 0.9226 - val_loss: 0.3509 - val_accuracy: 0.9200
Out[29]:
<tensorflow.python.keras.callbacks.History at 0x1bb5a461b38>
In [30]:
score4 = model4.evaluate(test_pad, test_encod,
                       batch_size=MODEL4['batch-size'], verbose=2)
print('Test loss:', score4[0])
print('Test accuracy:', score4[1])
1000/1 - 5s - loss: 0.4444 - accuracy: 0.9070
Test loss: 0.3792504758834839
Test accuracy: 0.907
In [31]:
import matplotlib.pyplot as plt
acc = history4.history['accuracy']
val_acc = history4.history['val_accuracy']
loss = history4.history['loss']
val_loss = history4.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()

plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
In [1]:
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
In [33]:
text_labels = encoder.classes_
y_pred = model4.predict(test_pad)
y_pred_clases = [text_labels[np.argmax(elemen)] for elemen in y_pred]
Y_test = text_labels[np.argmax(test_encod, axis=1)]
print("Classification Report: \n", classification_report(y_test.values, y_pred_clases))
Classification Report: 
               precision    recall  f1-score   support

     hiburan       0.86      0.80      0.83       202
    olahraga       0.97      0.98      0.98       191
     showbiz       0.87      0.93      0.90       193
 tajuk utama       0.90      0.87      0.88       187
   teknologi       0.93      0.96      0.94       227

    accuracy                           0.91      1000
   macro avg       0.91      0.91      0.91      1000
weighted avg       0.91      0.91      0.91      1000

In [34]:
met = confusion_matrix(y_test.values, y_pred_clases)
plot_confusion_matrix(conf_mat=met, figsize=(6, 6), class_names=text_labels)
Out[34]:
(<Figure size 432x432 with 1 Axes>,
 <matplotlib.axes._subplots.AxesSubplot at 0x1bb0241a080>)
In [26]:
lstm = load_model(r'D:\Basic Natural Languange Processing\Model\LSTM_model91.h5')
In [27]:
# Run a sample prediction against the existing test set
text_labels = encoder.classes_ 
for i in range(1):
    prediction = lstm.predict(np.array([test_pad[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    print(X_test.iloc[i][:50], "...")
    print('Actual label: ' + y_test.iloc[i])
    print("Predicted label: " + predicted_label)
    print([round(p * 100, 2) for p in prediction[0]], "\n")
jakarta news presiden joko widodo menerima kunjung ...
Actual label: tajuk utama
Predicted label: tajuk utama
[1.41, 1.1, 0.61, 96.3, 0.57] 
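
The loop above predicts on already-padded test rows. To classify a brand-new article, the exact training-time pipeline must be reused; a minimal sketch (the sample text is illustrative):

def predict_category(raw_text):
    # Reuse the same pipeline: clean -> remove stopwords -> tokenize -> pad
    cleaned = stopwordsRemoval(filtered(raw_text))
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    return text_labels[np.argmax(lstm.predict(padded))]

predict_category("Timnas Indonesia menang dramatis di laga kualifikasi tadi malam.")
# -> 'olahraga' would be the expected label for sport-flavoured text like this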

Model 3¶

In [28]:
MODEL2 = {'units':128, 'dropout':0.5, 'Lr':0.01, 'batch-size':128, 'epoch':50}
In [29]:
MAX_NB_WORDS = len(tokenizer.word_index)+1

model2 = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=train_pad.shape[1]),
    SpatialDropout1D(MODEL2['dropout']),
    LSTM(MODEL2['units'], dropout=MODEL2['dropout'], recurrent_dropout=MODEL2['dropout'], activation='tanh'),
    Dropout(MODEL2['dropout']),
    Dense(num_classes, activation='softmax')
])

model2.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, 200, 200)          11280200  
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 200, 200)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               168448    
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 5)                 645       
=================================================================
Total params: 11,449,293
Trainable params: 11,449,293
Non-trainable params: 0
_________________________________________________________________
In [30]:
model2.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=MODEL2['Lr']),
              metrics=['accuracy']
              )
In [31]:
esCallback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history2 = model2.fit(train_pad, train_encod,
                    batch_size=MODEL2['batch-size'],
                    epochs=MODEL2['epoch'],
                    validation_split=0.1,
                    verbose=1,
                    callbacks=[esCallback]
                    )
history2
Train on 8100 samples, validate on 900 samples
Epoch 1/50
8100/8100 [==============================] - 220s 27ms/sample - loss: 1.4692 - accuracy: 0.3202 - val_loss: 1.2958 - val_accuracy: 0.4067
Epoch 2/50
8100/8100 [==============================] - 219s 27ms/sample - loss: 1.2770 - accuracy: 0.4253 - val_loss: 1.2130 - val_accuracy: 0.4400
Epoch 3/50
8100/8100 [==============================] - 238s 29ms/sample - loss: 1.1596 - accuracy: 0.4747 - val_loss: 1.0015 - val_accuracy: 0.6000
Epoch 4/50
8100/8100 [==============================] - 195s 24ms/sample - loss: 0.8796 - accuracy: 0.6349 - val_loss: 0.6694 - val_accuracy: 0.7767
Epoch 5/50
8100/8100 [==============================] - 198s 24ms/sample - loss: 0.6520 - accuracy: 0.7604 - val_loss: 0.4319 - val_accuracy: 0.8767
Epoch 6/50
8100/8100 [==============================] - 200s 25ms/sample - loss: 0.5002 - accuracy: 0.8336 - val_loss: 0.3631 - val_accuracy: 0.9000
Epoch 7/50
8100/8100 [==============================] - 198s 24ms/sample - loss: 0.4120 - accuracy: 0.8731 - val_loss: 0.3491 - val_accuracy: 0.9044
Epoch 8/50
8100/8100 [==============================] - 203s 25ms/sample - loss: 0.3504 - accuracy: 0.8928 - val_loss: 0.3365 - val_accuracy: 0.9022
Epoch 9/50
8100/8100 [==============================] - 198s 24ms/sample - loss: 0.3020 - accuracy: 0.9088 - val_loss: 0.3453 - val_accuracy: 0.9044
Epoch 10/50
8100/8100 [==============================] - 199s 25ms/sample - loss: 0.2675 - accuracy: 0.9212 - val_loss: 0.3779 - val_accuracy: 0.9078
Epoch 11/50
8100/8100 [==============================] - 201s 25ms/sample - loss: 0.2312 - accuracy: 0.9323 - val_loss: 0.3903 - val_accuracy: 0.9144
Out[31]:
<tensorflow.python.keras.callbacks.History at 0x24aeccc9198>
In [36]:
score2 = model2.evaluate(test_pad, test_encod,
                       batch_size=MODEL2['batch-size'], verbose=2)
print('Test loss:', score2[0])
print('Test accuracy:', score2[1])
1000/1 - 5s - loss: 0.5143 - accuracy: 0.8930
Test loss: 0.47312470149993896
Test accuracy: 0.893
In [32]:
import matplotlib.pyplot as plt
acc = history2.history['accuracy']
val_acc = history2.history['val_accuracy']
loss = history2.history['loss']
val_loss = history2.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()

plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
In [33]:
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
In [34]:
text_labels = encoder.classes_
y_pred = model2.predict(test_pad)
y_pred_clases = [text_labels[np.argmax(elemen)] for elemen in y_pred]
Y_test = text_labels[np.argmax(test_encod, axis=1)]
print("Classification Report: \n", classification_report(y_test.values, y_pred_clases))
Classification Report: 
               precision    recall  f1-score   support

     hiburan       0.85      0.77      0.81       202
    olahraga       0.97      0.99      0.98       191
     showbiz       0.84      0.90      0.87       193
 tajuk utama       0.84      0.87      0.86       187
   teknologi       0.95      0.93      0.94       227

    accuracy                           0.89      1000
   macro avg       0.89      0.89      0.89      1000
weighted avg       0.89      0.89      0.89      1000

In [35]:
met = confusion_matrix(y_test.values, y_pred_clases)
plot_confusion_matrix(conf_mat=met, figsize=(6, 6), class_names=text_labels)
Out[35]:
(<Figure size 432x432 with 1 Axes>,
 <matplotlib.axes._subplots.AxesSubplot at 0x24a8f0e6710>)

Model 4¶

In [37]:
MODEL4 = {'units':128, 'dropout':0.5, 'Lr':0.001, 'batch-size':128, 'epoch':50}
MODEL4
Out[37]:
{'units': 128, 'dropout': 0.5, 'Lr': 0.001, 'batch-size': 128, 'epoch': 50}
In [38]:
MAX_NB_WORDS = len(tokenizer.word_index)+1

model4 = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=train_pad.shape[1]),
    SpatialDropout1D(MODEL4['dropout']),
    LSTM(MODEL4['units'], dropout=MODEL4['dropout'], recurrent_dropout=MODEL4['dropout'], activation='tanh'),
    Dropout(MODEL4['dropout']),
    Dense(num_classes, activation='softmax')
])

model4.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, 200, 200)          11280200  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 200)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               168448    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 645       
=================================================================
Total params: 11,449,293
Trainable params: 11,449,293
Non-trainable params: 0
_________________________________________________________________
In [39]:
model4.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=MODEL4['Lr']),
              metrics=['accuracy']
              )
In [40]:
esCallback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history4 = model4.fit(train_pad, train_encod,
                    batch_size=MODEL4['batch-size'],
                    epochs=MODEL4['epoch'],
                    validation_split=0.1,
                    verbose=1,
                    callbacks=[esCallback]
                    )
history4
Train on 8100 samples, validate on 900 samples
Epoch 1/50
8100/8100 [==============================] - 195s 24ms/sample - loss: 1.5387 - accuracy: 0.2796 - val_loss: 1.3291 - val_accuracy: 0.3444
Epoch 2/50
8100/8100 [==============================] - 217s 27ms/sample - loss: 1.3621 - accuracy: 0.3685 - val_loss: 1.2854 - val_accuracy: 0.4144
Epoch 3/50
8100/8100 [==============================] - 203s 25ms/sample - loss: 1.3113 - accuracy: 0.3993 - val_loss: 1.2836 - val_accuracy: 0.4033
Epoch 4/50
8100/8100 [==============================] - 240s 30ms/sample - loss: 1.3006 - accuracy: 0.4221 - val_loss: 1.2304 - val_accuracy: 0.4222
Epoch 5/50
8100/8100 [==============================] - 201s 25ms/sample - loss: 1.2420 - accuracy: 0.4559 - val_loss: 1.1208 - val_accuracy: 0.4856
Epoch 6/50
8100/8100 [==============================] - 200s 25ms/sample - loss: 1.1096 - accuracy: 0.5389 - val_loss: 0.9832 - val_accuracy: 0.5956
Epoch 7/50
8100/8100 [==============================] - 248s 31ms/sample - loss: 1.0261 - accuracy: 0.5790 - val_loss: 0.9041 - val_accuracy: 0.6189
Epoch 8/50
8100/8100 [==============================] - 247s 30ms/sample - loss: 0.9798 - accuracy: 0.5984 - val_loss: 0.8874 - val_accuracy: 0.6267
Epoch 9/50
8100/8100 [==============================] - 202s 25ms/sample - loss: 0.9108 - accuracy: 0.6328 - val_loss: 0.8676 - val_accuracy: 0.6222
Epoch 10/50
8100/8100 [==============================] - 206s 25ms/sample - loss: 0.8689 - accuracy: 0.6407 - val_loss: 0.8545 - val_accuracy: 0.6278
Epoch 11/50
8100/8100 [==============================] - 204s 25ms/sample - loss: 0.8775 - accuracy: 0.6372 - val_loss: 0.7987 - val_accuracy: 0.6744
Epoch 12/50
8100/8100 [==============================] - 203s 25ms/sample - loss: 0.8300 - accuracy: 0.6662 - val_loss: 0.6321 - val_accuracy: 0.8033
Epoch 13/50
8100/8100 [==============================] - 215s 26ms/sample - loss: 0.7417 - accuracy: 0.7286 - val_loss: 0.5899 - val_accuracy: 0.7967
Epoch 14/50
8100/8100 [==============================] - 209s 26ms/sample - loss: 0.7310 - accuracy: 0.7354 - val_loss: 0.6281 - val_accuracy: 0.7889
Epoch 15/50
8100/8100 [==============================] - 213s 26ms/sample - loss: 0.7454 - accuracy: 0.7300 - val_loss: 0.5219 - val_accuracy: 0.8644
Epoch 16/50
8100/8100 [==============================] - 206s 25ms/sample - loss: 0.7185 - accuracy: 0.7602 - val_loss: 0.5529 - val_accuracy: 0.8389
Epoch 17/50
8100/8100 [==============================] - 201s 25ms/sample - loss: 0.7218 - accuracy: 0.7577 - val_loss: 0.5118 - val_accuracy: 0.8656
Epoch 18/50
8100/8100 [==============================] - 200s 25ms/sample - loss: 0.6211 - accuracy: 0.7974 - val_loss: 0.4528 - val_accuracy: 0.8711
Epoch 19/50
8100/8100 [==============================] - 201s 25ms/sample - loss: 0.5522 - accuracy: 0.8347 - val_loss: 0.4164 - val_accuracy: 0.8889
Epoch 20/50
8100/8100 [==============================] - 199s 25ms/sample - loss: 0.5333 - accuracy: 0.8383 - val_loss: 0.3924 - val_accuracy: 0.8978
Epoch 21/50
8100/8100 [==============================] - 202s 25ms/sample - loss: 0.5096 - accuracy: 0.8475 - val_loss: 0.3777 - val_accuracy: 0.9033
Epoch 22/50
8100/8100 [==============================] - 208s 26ms/sample - loss: 0.4911 - accuracy: 0.8531 - val_loss: 0.3857 - val_accuracy: 0.8944
Epoch 23/50
8100/8100 [==============================] - 200s 25ms/sample - loss: 0.4823 - accuracy: 0.8505 - val_loss: 0.3595 - val_accuracy: 0.9044
Epoch 24/50
8100/8100 [==============================] - 202s 25ms/sample - loss: 0.4260 - accuracy: 0.8746 - val_loss: 0.3565 - val_accuracy: 0.9067
Epoch 25/50
8100/8100 [==============================] - 200s 25ms/sample - loss: 0.3988 - accuracy: 0.8860 - val_loss: 0.3428 - val_accuracy: 0.9089
Epoch 26/50
8100/8100 [==============================] - 203s 25ms/sample - loss: 0.3843 - accuracy: 0.8860 - val_loss: 0.3399 - val_accuracy: 0.9111
Epoch 27/50
8100/8100 [==============================] - 199s 25ms/sample - loss: 0.3685 - accuracy: 0.8974 - val_loss: 0.3354 - val_accuracy: 0.9111
Epoch 28/50
8100/8100 [==============================] - 199s 25ms/sample - loss: 0.3559 - accuracy: 0.8995 - val_loss: 0.3250 - val_accuracy: 0.9122
Epoch 29/50
8100/8100 [==============================] - 200s 25ms/sample - loss: 0.3438 - accuracy: 0.9009 - val_loss: 0.3338 - val_accuracy: 0.9056
Epoch 30/50
8100/8100 [==============================] - 205s 25ms/sample - loss: 0.3379 - accuracy: 0.9037 - val_loss: 0.3234 - val_accuracy: 0.9000
Epoch 31/50
8100/8100 [==============================] - 199s 25ms/sample - loss: 0.3266 - accuracy: 0.9079 - val_loss: 0.3205 - val_accuracy: 0.9111
Epoch 32/50
8100/8100 [==============================] - 203s 25ms/sample - loss: 0.3051 - accuracy: 0.9180 - val_loss: 0.3269 - val_accuracy: 0.9133
Epoch 33/50
8100/8100 [==============================] - 201s 25ms/sample - loss: 0.2899 - accuracy: 0.9194 - val_loss: 0.3338 - val_accuracy: 0.9111
Epoch 34/50
8100/8100 [==============================] - 198s 24ms/sample - loss: 0.2843 - accuracy: 0.9219 - val_loss: 0.3160 - val_accuracy: 0.9156
Epoch 35/50
8100/8100 [==============================] - 209s 26ms/sample - loss: 0.2524 - accuracy: 0.9311 - val_loss: 0.3330 - val_accuracy: 0.9156
Epoch 36/50
8100/8100 [==============================] - 199s 25ms/sample - loss: 0.2350 - accuracy: 0.9411 - val_loss: 0.3210 - val_accuracy: 0.9200
Epoch 37/50
8100/8100 [==============================] - 199s 25ms/sample - loss: 0.2930 - accuracy: 0.9257 - val_loss: 0.3604 - val_accuracy: 0.9067
Out[40]:
<tensorflow.python.keras.callbacks.History at 0x24a90ad7748>
In [41]:
score4 = model4.evaluate(test_pad, test_encod,
                       batch_size=MODEL4['batch-size'], verbose=2)
print('Test loss:', score4[0])
print('Test accuracy:', score4[1])
1000/1 - 5s - loss: 0.4530 - accuracy: 0.8910
Test loss: 0.42583607721328737
Test accuracy: 0.891
In [42]:
import matplotlib.pyplot as plt
acc = history4.history['accuracy']
val_acc = history4.history['val_accuracy']
loss = history4.history['loss']
val_loss = history4.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()

plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
In [43]:
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
In [44]:
text_labels = encoder.classes_
y_pred = model4.predict(test_pad)
y_pred_clases = [text_labels[np.argmax(elemen)] for elemen in y_pred]
Y_test = text_labels[np.argmax(test_encod, axis=1)]
print("Classification Report: \n", classification_report(y_test.values, y_pred_clases))
Classification Report: 
               precision    recall  f1-score   support

     hiburan       0.79      0.83      0.81       202
    olahraga       0.97      0.98      0.98       191
     showbiz       0.88      0.90      0.89       193
 tajuk utama       0.83      0.88      0.85       187
   teknologi       0.98      0.87      0.92       227

    accuracy                           0.89      1000
   macro avg       0.89      0.89      0.89      1000
weighted avg       0.90      0.89      0.89      1000

In [45]:
met = confusion_matrix(y_test.values, y_pred_clases)
plot_confusion_matrix(conf_mat=met, figsize=(6, 6), class_names=text_labels)
Out[45]:
(<Figure size 432x432 with 1 Axes>,
 <matplotlib.axes._subplots.AxesSubplot at 0x24a9cbad908>)

Model 5¶

In [28]:
# 70:30 train/test split
MODEL7 = {'units':128, 'dropout':0.5, 'Lr':0.001, 'batch-size':128, 'epoch':50}
MODEL7
Out[28]:
{'units': 128, 'dropout': 0.5, 'Lr': 0.001, 'batch-size': 128, 'epoch': 50}
In [29]:
MAX_NB_WORDS = len(tokenizer.word_index)+1

model7 = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=train_pad.shape[1]),
    SpatialDropout1D(MODEL7['dropout']),
    LSTM(MODEL7['units'], dropout=MODEL7['dropout'], recurrent_dropout=MODEL7['dropout'], activation='tanh'),
    Dropout(MODEL7['dropout']),
    Dense(num_classes, activation='softmax')
])

model7.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, 200, 200)          12833800  
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 200, 200)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               168448    
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 5)                 645       
=================================================================
Total params: 13,002,893
Trainable params: 13,002,893
Non-trainable params: 0
_________________________________________________________________
In [30]:
model7.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=MODEL7['Lr']),
              metrics=['accuracy']
              )
In [31]:
esCallback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history7 = model7.fit(train_pad, train_encod,
                    batch_size=MODEL7['batch-size'],
                    epochs=MODEL7['epoch'],
                    validation_split=0.1,
                    verbose=1,
                    callbacks=[esCallback]
                    )
history7
Train on 6300 samples, validate on 700 samples
Epoch 1/50
6300/6300 [==============================] - 141s 22ms/sample - loss: 1.5600 - accuracy: 0.2692 - val_loss: 1.3414 - val_accuracy: 0.3543
Epoch 2/50
6300/6300 [==============================] - 146s 23ms/sample - loss: 1.4482 - accuracy: 0.3479 - val_loss: 1.3195 - val_accuracy: 0.3700
Epoch 3/50
6300/6300 [==============================] - 156s 25ms/sample - loss: 1.3575 - accuracy: 0.3778 - val_loss: 1.2755 - val_accuracy: 0.3743
Epoch 4/50
6300/6300 [==============================] - 156s 25ms/sample - loss: 1.3123 - accuracy: 0.3994 - val_loss: 1.1706 - val_accuracy: 0.5386
Epoch 5/50
6300/6300 [==============================] - 159s 25ms/sample - loss: 1.2719 - accuracy: 0.4498 - val_loss: 1.1580 - val_accuracy: 0.5271
Epoch 6/50
6300/6300 [==============================] - 159s 25ms/sample - loss: 1.2186 - accuracy: 0.4813 - val_loss: 1.0657 - val_accuracy: 0.5557
Epoch 7/50
6300/6300 [==============================] - 175s 28ms/sample - loss: 1.1794 - accuracy: 0.5076 - val_loss: 1.0112 - val_accuracy: 0.6071
Epoch 8/50
6300/6300 [==============================] - 165s 26ms/sample - loss: 1.1136 - accuracy: 0.5446 - val_loss: 0.9531 - val_accuracy: 0.6014
Epoch 9/50
6300/6300 [==============================] - 171s 27ms/sample - loss: 1.0233 - accuracy: 0.5803 - val_loss: 0.9087 - val_accuracy: 0.6129
Epoch 10/50
6300/6300 [==============================] - 180s 29ms/sample - loss: 1.0037 - accuracy: 0.5948 - val_loss: 0.9252 - val_accuracy: 0.6186
Epoch 11/50
6300/6300 [==============================] - 164s 26ms/sample - loss: 0.9599 - accuracy: 0.6087 - val_loss: 0.8704 - val_accuracy: 0.6257
Epoch 12/50
6300/6300 [==============================] - 172s 27ms/sample - loss: 0.9051 - accuracy: 0.6221 - val_loss: 0.8552 - val_accuracy: 0.6271
Epoch 13/50
6300/6300 [==============================] - 165s 26ms/sample - loss: 0.9043 - accuracy: 0.6229 - val_loss: 0.8252 - val_accuracy: 0.6371
Epoch 14/50
6300/6300 [==============================] - 214s 34ms/sample - loss: 0.8730 - accuracy: 0.6408 - val_loss: 0.8445 - val_accuracy: 0.6329
Epoch 15/50
6300/6300 [==============================] - 158s 25ms/sample - loss: 0.8921 - accuracy: 0.6460 - val_loss: 0.9585 - val_accuracy: 0.6357
Epoch 16/50
6300/6300 [==============================] - 151s 24ms/sample - loss: 0.8818 - accuracy: 0.6463 - val_loss: 0.8213 - val_accuracy: 0.6514
Epoch 17/50
6300/6300 [==============================] - 160s 25ms/sample - loss: 0.8303 - accuracy: 0.6633 - val_loss: 0.8147 - val_accuracy: 0.6414
Epoch 18/50
6300/6300 [==============================] - 168s 27ms/sample - loss: 0.8139 - accuracy: 0.6552 - val_loss: 0.8035 - val_accuracy: 0.6471
Epoch 19/50
6300/6300 [==============================] - 160s 25ms/sample - loss: 0.7886 - accuracy: 0.6694 - val_loss: 0.8083 - val_accuracy: 0.6514
Epoch 20/50
6300/6300 [==============================] - 160s 25ms/sample - loss: 0.7839 - accuracy: 0.6870 - val_loss: 0.6593 - val_accuracy: 0.7800
Epoch 21/50
6300/6300 [==============================] - 153s 24ms/sample - loss: 0.7853 - accuracy: 0.6910 - val_loss: 0.7030 - val_accuracy: 0.7314
Epoch 22/50
6300/6300 [==============================] - 162s 26ms/sample - loss: 0.7441 - accuracy: 0.7083 - val_loss: 0.6170 - val_accuracy: 0.7971
Epoch 23/50
6300/6300 [==============================] - 149s 24ms/sample - loss: 0.7338 - accuracy: 0.7297 - val_loss: 0.6490 - val_accuracy: 0.7571
Epoch 24/50
6300/6300 [==============================] - 149s 24ms/sample - loss: 0.6995 - accuracy: 0.7492 - val_loss: 0.6165 - val_accuracy: 0.8071
Epoch 25/50
6300/6300 [==============================] - 153s 24ms/sample - loss: 0.6659 - accuracy: 0.7630 - val_loss: 0.5904 - val_accuracy: 0.8214
Epoch 26/50
6300/6300 [==============================] - 151s 24ms/sample - loss: 0.6304 - accuracy: 0.7790 - val_loss: 0.7150 - val_accuracy: 0.7571
Epoch 27/50
6300/6300 [==============================] - 148s 24ms/sample - loss: 0.7118 - accuracy: 0.7470 - val_loss: 0.6387 - val_accuracy: 0.7500
Epoch 28/50
6300/6300 [==============================] - 147s 23ms/sample - loss: 0.7015 - accuracy: 0.7494 - val_loss: 0.5983 - val_accuracy: 0.8214
Out[31]:
<tensorflow.python.keras.callbacks.History at 0x2aea4b6e4e0>
In [33]:
score7 = model7.evaluate(test_pad, test_encod,
                       batch_size=MODEL7['batch-size'], verbose=2)
print('Test loss:', score7[0])
print('Test accuracy:', score7[1])
3000/1 - 15s - loss: 0.6587 - accuracy: 0.7910
Test loss: 0.6494710607528686
Test accuracy: 0.791
In [34]:
import matplotlib.pyplot as plt
acc = history7.history['accuracy']
val_acc = history7.history['val_accuracy']
loss = history7.history['loss']
val_loss = history7.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()

plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
In [35]:
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
In [36]:
text_labels = encoder.classes_
y_pred = model7.predict(test_pad)
y_pred_clases = [text_labels[np.argmax(elemen)] for elemen in y_pred]
Y_test = text_labels[np.argmax(test_encod, axis=1)]
print("Classification Report: \n", classification_report(y_test.values, y_pred_clases))
Classification Report: 
               precision    recall  f1-score   support

     hiburan       0.87      0.61      0.72       613
    olahraga       0.97      0.92      0.95       603
     showbiz       0.75      0.61      0.67       590
 tajuk utama       0.56      0.93      0.70       564
   teknologi       0.95      0.88      0.92       630

    accuracy                           0.79      3000
   macro avg       0.82      0.79      0.79      3000
weighted avg       0.83      0.79      0.79      3000

In [37]:
met = confusion_matrix(y_test.values, y_pred_clases)
plot_confusion_matrix(conf_mat=met, figsize=(6, 6), class_names=text_labels)
Out[37]:
(<Figure size 432x432 with 1 Axes>,
 <matplotlib.axes._subplots.AxesSubplot at 0x2aead6e86a0>)

Model 6¶

In [28]:
# 80:20 train/test split
MODEL8 = {'units':128, 'dropout':0.5, 'Lr':0.001, 'batch-size':128, 'epoch':50}
MODEL8
Out[28]:
{'units': 128, 'dropout': 0.5, 'Lr': 0.001, 'batch-size': 128, 'epoch': 50}
In [29]:
MAX_NB_WORDS = len(tokenizer.word_index)+1

model8 = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=train_pad.shape[1]),
    SpatialDropout1D(MODEL8['dropout']),
    LSTM(MODEL8['units'], dropout=MODEL8['dropout'], recurrent_dropout=MODEL8['dropout'], activation='tanh'),
    Dropout(MODEL8['dropout']),
    Dense(num_classes, activation='softmax')
])

model8.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, 200, 200)          13675800  
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 200, 200)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               168448    
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 5)                 645       
=================================================================
Total params: 13,844,893
Trainable params: 13,844,893
Non-trainable params: 0
_________________________________________________________________
In [30]:
model8.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=MODEL8['Lr']),
              metrics=['accuracy']
              )
In [31]:
esCallback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history8 = model8.fit(train_pad, train_encod,
                    batch_size=MODEL8['batch-size'],
                    epochs=MODEL8['epoch'],
                    validation_split=0.1,
                    verbose=1,
                    callbacks=[esCallback]
                    )
history8
Train on 7200 samples, validate on 800 samples
Epoch 1/50
7200/7200 [==============================] - 150s 21ms/sample - loss: 1.5368 - accuracy: 0.2719 - val_loss: 1.3265 - val_accuracy: 0.3675
Epoch 2/50
7200/7200 [==============================] - 152s 21ms/sample - loss: 1.3534 - accuracy: 0.3676 - val_loss: 1.3147 - val_accuracy: 0.3837
Epoch 3/50
7200/7200 [==============================] - 156s 22ms/sample - loss: 1.3300 - accuracy: 0.3789 - val_loss: 1.2552 - val_accuracy: 0.4238
Epoch 4/50
7200/7200 [==============================] - 154s 21ms/sample - loss: 1.2847 - accuracy: 0.4203 - val_loss: 1.2063 - val_accuracy: 0.4363
Epoch 5/50
7200/7200 [==============================] - 160s 22ms/sample - loss: 1.2374 - accuracy: 0.4556 - val_loss: 1.1674 - val_accuracy: 0.4600
Epoch 6/50
7200/7200 [==============================] - 162s 22ms/sample - loss: 1.1778 - accuracy: 0.4926 - val_loss: 1.1220 - val_accuracy: 0.5088
Epoch 7/50
7200/7200 [==============================] - 163s 23ms/sample - loss: 1.1413 - accuracy: 0.5167 - val_loss: 0.9792 - val_accuracy: 0.5950
Epoch 8/50
7200/7200 [==============================] - 166s 23ms/sample - loss: 1.0838 - accuracy: 0.5681 - val_loss: 0.8655 - val_accuracy: 0.6350
Epoch 9/50
7200/7200 [==============================] - 165s 23ms/sample - loss: 0.9854 - accuracy: 0.5890 - val_loss: 0.8593 - val_accuracy: 0.6200
Epoch 10/50
7200/7200 [==============================] - 164s 23ms/sample - loss: 0.8914 - accuracy: 0.6294 - val_loss: 0.8244 - val_accuracy: 0.6513
Epoch 11/50
7200/7200 [==============================] - 172s 24ms/sample - loss: 0.9085 - accuracy: 0.6199 - val_loss: 0.8427 - val_accuracy: 0.6363
Epoch 12/50
7200/7200 [==============================] - 168s 23ms/sample - loss: 0.8556 - accuracy: 0.6406 - val_loss: 0.8318 - val_accuracy: 0.6363
Epoch 13/50
7200/7200 [==============================] - 164s 23ms/sample - loss: 0.8174 - accuracy: 0.6447 - val_loss: 0.8116 - val_accuracy: 0.6562
Epoch 14/50
7200/7200 [==============================] - 168s 23ms/sample - loss: 0.7722 - accuracy: 0.6658 - val_loss: 0.7700 - val_accuracy: 0.6675
Epoch 15/50
7200/7200 [==============================] - 164s 23ms/sample - loss: 0.7747 - accuracy: 0.6686 - val_loss: 0.7495 - val_accuracy: 0.6750
Epoch 16/50
7200/7200 [==============================] - 162s 22ms/sample - loss: 0.7518 - accuracy: 0.6726 - val_loss: 0.7579 - val_accuracy: 0.6637
Epoch 17/50
7200/7200 [==============================] - 167s 23ms/sample - loss: 0.7360 - accuracy: 0.6746 - val_loss: 0.7768 - val_accuracy: 0.6637
Epoch 18/50
7200/7200 [==============================] - 168s 23ms/sample - loss: 0.7156 - accuracy: 0.6953 - val_loss: 0.6634 - val_accuracy: 0.7225
Epoch 19/50
7200/7200 [==============================] - 166s 23ms/sample - loss: 0.6522 - accuracy: 0.7357 - val_loss: 0.5419 - val_accuracy: 0.8250
Epoch 20/50
7200/7200 [==============================] - 175s 24ms/sample - loss: 0.6047 - accuracy: 0.7771 - val_loss: 0.5249 - val_accuracy: 0.8100
Epoch 21/50
7200/7200 [==============================] - 170s 24ms/sample - loss: 0.5678 - accuracy: 0.8043 - val_loss: 0.4025 - val_accuracy: 0.9150
Epoch 22/50
7200/7200 [==============================] - 161s 22ms/sample - loss: 0.5569 - accuracy: 0.8226 - val_loss: 0.4411 - val_accuracy: 0.8950
Epoch 23/50
7200/7200 [==============================] - 163s 23ms/sample - loss: 0.5474 - accuracy: 0.8235 - val_loss: 0.4338 - val_accuracy: 0.9013
Epoch 24/50
7200/7200 [==============================] - 163s 23ms/sample - loss: 0.5426 - accuracy: 0.8343 - val_loss: 0.4076 - val_accuracy: 0.9075
Out[31]:
<tensorflow.python.keras.callbacks.History at 0x1e83b4d5a58>
In [32]:
score8 = model8.evaluate(test_pad, test_encod,
                       batch_size=MODEL8['batch-size'], verbose=2)
print('Test loss:', score8[0])
print('Test accuracy:', score8[1])
2000/1 - 12s - loss: 0.4838 - accuracy: 0.8830
Test loss: 0.49501818656921387
Test accuracy: 0.883
In [33]:
import matplotlib.pyplot as plt
acc = history8.history['accuracy']
val_acc = history8.history['val_accuracy']
loss = history8.history['loss']
val_loss = history8.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()

plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
In [34]:
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
In [35]:
text_labels = encoder.classes_
y_pred = model8.predict(test_pad)
y_pred_clases = [text_labels[np.argmax(elemen)] for elemen in y_pred]
Y_test = text_labels[np.argmax(test_encod, axis=1)]
print("Classification Report: \n", classification_report(y_test.values, y_pred_clases))
Classification Report: 
               precision    recall  f1-score   support

     hiburan       0.83      0.76      0.79       408
    olahraga       0.96      0.95      0.96       397
     showbiz       0.80      0.92      0.86       382
 tajuk utama       0.85      0.87      0.86       382
   teknologi       0.97      0.91      0.94       431

    accuracy                           0.88      2000
   macro avg       0.88      0.88      0.88      2000
weighted avg       0.89      0.88      0.88      2000

In [36]:
met = confusion_matrix(y_test.values, y_pred_clases)
plot_confusion_matrix(conf_mat=met, figsize=(6, 6), class_names=text_labels)
Out[36]:
(<Figure size 432x432 with 1 Axes>,
 <matplotlib.axes._subplots.AxesSubplot at 0x1e834334ac8>)