import os
import re
import string
import numpy as np
import pandas as pd
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam, Adamax, SGD
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout, SpatialDropout1D
from tensorflow.keras import regularizers
import random as rn
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(123)
rn.seed(123)
tf.random.set_seed(123)
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/iwanxone/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
True
path = r"./Dataset/indosum datasets"
df = pd.read_csv(path, header=None, delimiter='\t')
df.columns = ['category','source', 'article']
df.head()
|   | category | source | article |
| --- | --- | --- | --- |
| 0 | teknologi | kumparan | Uber pada hari Jumat mengatakan akan menguak d... |
| 1 | teknologi | dailysocial.id | Menyusul jejak NES Classic Edition , SNES Clas... |
| 2 | teknologi | dailysocial.id | MDI Ventures , perusahaan modal ventura yang d... |
| 3 | teknologi | kumparan | Mazda masih menutup rapat informasi soal sport... |
| 4 | teknologi | dailysocial.id | Sampai di akhir tahun 2017 ini , frasa “ print... |
df.isna().sum()
category    0
source      0
article     0
dtype: int64
df['category'].value_counts()
teknologi      2000
tajuk utama    2000
showbiz        2000
olahraga       2000
hiburan        2000
Name: category, dtype: int64
df.groupby(df['category']).count()
| category | source | article |
| --- | --- | --- |
| hiburan | 2000 | 2000 |
| olahraga | 2000 | 2000 |
| showbiz | 2000 | 2000 |
| tajuk utama | 2000 | 2000 |
| teknologi | 2000 | 2000 |
import seaborn as sns
import matplotlib.pyplot as plt
import warnings as wrn
wrn.filterwarnings('ignore', category = DeprecationWarning)
wrn.filterwarnings('ignore', category = FutureWarning)
wrn.filterwarnings('ignore', category = UserWarning)
wrn.filterwarnings('ignore', category = RuntimeWarning)
plt.title('Category Count')
sns.countplot(x=df['category'])
def filtered(text):
    """
    Lowercase the text, remove text in square brackets, remove links,
    remove punctuation, and remove words containing numbers.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
def stopwordsRemoval(text):
    """
    Filter the words in the corpus and return only
    the informative words of each document.
    """
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    tokens = tokenizer.tokenize(text)
    listStopwords = set(stopwords.words('indonesian') + stopwords.words('english'))
    filtered = [s for s in tokens if s not in listStopwords]
    textFiltered = ' '.join(filtered)
    return textFiltered
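As a quick sanity check, the two helpers can be chained on a made-up sentence (illustrative input, not from the dataset; the commented outputs are approximate):

sample = "Kunjungi https://contoh.com! Harga iPhone12 turun 20% di 2023."
print(filtered(sample))                    # roughly: 'kunjungi  harga  turun  di '
print(stopwordsRemoval(filtered(sample)))  # roughly: 'kunjungi harga turun' ('di' is an Indonesian stopword)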
# factory = StemmerFactory()
# stemmer = factory.create_stemmer()
df['text_clean'] = df['article'].apply(lambda x: filtered(x))
df['stopword_removal'] = df['text_clean'].apply(lambda x: stopwordsRemoval(x))
# df['stemming'] = df['stopword_removal'].apply(lambda x: stemmer.stem(x))
df['Number_of_words'] = df['stopword_removal'].apply(lambda x:len(str(x).split()))
df.loc[0:5, ['stopword_removal','Number_of_words']]
|   | stopword_removal | Number_of_words |
| --- | --- | --- |
| 0 | uber jumat menguak data perjalanan paris publi... | 142 |
| 1 | menyusul jejak nes classic edition snes classi... | 166 |
| 2 | mdi ventures perusahaan modal ventura didukung... | 291 |
| 3 | mazda menutup rapat informasi sportscar anyar ... | 131 |
| 4 | frasa printer mencetak video terdengar mustahi... | 140 |
| 5 | dxo one aksesori mungil berfungsi mengubah iph... | 156 |
df['Number_of_words'].describe()
count    10000.000000
mean       170.040400
std         68.460311
min         20.000000
25%        124.000000
50%        158.000000
75%        205.000000
max        662.000000
Name: Number_of_words, dtype: float64
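After stopword removal the articles have a median length of 158 tokens and a maximum of 662, which motivates the choice of MAX_SEQUENCE_LENGTH = 200 below: most documents fit in full, and only the longer quarter or so are truncated rather than padding every sequence out to 662.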
X_train, X_test, y_train, y_test = train_test_split(df['stopword_removal'],
df['category'],
test_size=0.1,
random_state=25)
X_train
9189    jakarta cnn indonesia suzzanna ikon film horor...
6354    jakarta cnn indonesia kapten persib bandung at...
2217    jakarta cnn indonesia tujuh siswa setingkat sm...
5663    jakarta cnn indonesia musik punk mati grup leg...
5102    korea selatan bts meraih sukses industri musik...
                              ...
1175    true money indonesia salah perusahaan uang ele...
8447    hidrogen peroksida alias cairan asam lemah ber...
2934    jakarta cnn indonesia polda metro jaya berkoor...
6618    juaranet laga digelar seri indonesian basketba...
8510    coca cola menegak segelas coca cola dingin sen...
Name: stopword_removal, Length: 9000, dtype: object
X_test.index[:25]
Int64Index([3555, 4078, 8445, 5939, 5583, 1656, 5550, 1736, 6297, 6364, 6341, 2802, 8579, 2351, 877, 5844, 129, 7234, 5704, 6047, 79, 2969, 5948, 8906, 9914], dtype='int64')
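Since every class has exactly 2,000 articles, a plain random split is reasonable here; with imbalanced data, a stratified split would preserve per-class proportions in both partitions. A minimal sketch (illustrative only — the runs below keep the plain split, and the `*_s` names are hypothetical):

Xtr_s, Xte_s, ytr_s, yte_s = train_test_split(df['stopword_removal'], df['category'],
                                              test_size=0.1, random_state=25,
                                              stratify=df['category'])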
MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 200
tokenizer = Tokenizer(oov_token='UNK', lower=True)
tokenizer.fit_on_texts(X_train)
train_seq = tokenizer.texts_to_sequences(X_train)
train_pad = pad_sequences(train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
test_seq = tokenizer.texts_to_sequences(X_test)
test_pad = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
train_pad
array([[ 4906,  2713,  7172, ...,  1240,  1078,  1078],
       [    3,    12,     2, ...,     0,     0,     0],
       [    3,    12,     2, ...,     0,     0,     0],
       ...,
       [    3,    12,     2, ...,     0,     0,     0],
       [  481,    33,   188, ...,     0,     0,     0],
       [ 7671,  6887, 38458, ...,     0,     0,     0]], dtype=int32)
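Index 0 is reserved for padding and index 1 for the UNK token, so unseen test words do not break the lookup. As a quick round-trip check (a sketch; the decoded words depend on the fitted vocabulary):

# Decode the first ten ids of the first padded row back to tokens;
# padding ids (0) are skipped by the tokenizer.
print(tokenizer.sequences_to_texts([train_pad[0][:10].tolist()]))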
encoder = LabelEncoder()
encoder.fit(y_train)
train_encod = encoder.transform(y_train)
test_encod = encoder.transform(y_test)
num_classes = np.max(train_encod) + 1
train_encod = to_categorical(train_encod, num_classes)
test_encod = to_categorical(test_encod, num_classes)
print(train_encod)
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 ...
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]]
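LabelEncoder assigns indices in sorted class order, so the one-hot columns above should map to the categories as follows (easy to confirm from the fitted encoder):

print(dict(enumerate(encoder.classes_)))
# expected: {0: 'hiburan', 1: 'olahraga', 2: 'showbiz', 3: 'tajuk utama', 4: 'teknologi'}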
print('X_train', train_pad.shape)
print('X_test', test_pad.shape)
print('y_train', train_encod.shape)
print('y_test', test_encod.shape)
X_train (9000, 200)
X_test (1000, 200)
y_train (9000, 5)
y_test (1000, 5)
MODEL2 = {'units':128, 'dropout':0.5, 'Lr':0.01, 'batch-size':128, 'epoch':50}
MAX_NB_WORDS = len(tokenizer.word_index)+1
model2 = Sequential([
Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=train_pad.shape[1]),
SpatialDropout1D(MODEL2['dropout']),
LSTM(MODEL2['units'], dropout=MODEL2['dropout'], recurrent_dropout=MODEL2['dropout'], activation='tanh'),
Dropout(MODEL2['dropout']),
Dense(num_classes, activation='softmax')
])
model2.summary()
Model: "sequential_2" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_2 (Embedding) (None, 200, 200) 14488600 spatial_dropout1d_2 (Spatia (None, 200, 200) 0 lDropout1D) lstm_2 (LSTM) (None, 128) 168448 dropout_2 (Dropout) (None, 128) 0 dense_2 (Dense) (None, 5) 645 ================================================================= Total params: 14,657,693 Trainable params: 14,657,693 Non-trainable params: 0 _________________________________________________________________
model2.compile(loss='categorical_crossentropy',
optimizer=Adam(learning_rate=MODEL2['Lr']),
metrics=['accuracy']
)
esCallback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
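Note that EarlyStopping as configured stops three epochs after val_loss last improved, but keeps the weights from the final epoch, not the best one. Keras supports rolling back to the best-val_loss weights via restore_best_weights (left commented out here so the runs below match the logged results):

# esCallback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3,
#                                               restore_best_weights=True)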
history2 = model2.fit(train_pad, train_encod,
batch_size=MODEL2['batch-size'],
epochs=MODEL2['epoch'],
validation_split=0.1,
verbose=1,
callbacks=[esCallback]
)
Epoch 1/50 - 266s - loss: 1.4531 - accuracy: 0.3305 - val_loss: 1.3656 - val_accuracy: 0.3711
Epoch 2/50 - 313s - loss: 1.4909 - accuracy: 0.3067 - val_loss: 1.4505 - val_accuracy: 0.3422
Epoch 3/50 - 293s - loss: 1.3740 - accuracy: 0.3693 - val_loss: 1.2777 - val_accuracy: 0.4267
Epoch 4/50 - 257s - loss: 1.2727 - accuracy: 0.4252 - val_loss: 1.2258 - val_accuracy: 0.4689
Epoch 5/50 - 246s - loss: 1.1278 - accuracy: 0.5100 - val_loss: 0.9642 - val_accuracy: 0.5800
Epoch 6/50 - 268s - loss: 0.9500 - accuracy: 0.5977 - val_loss: 0.8386 - val_accuracy: 0.6178
Epoch 7/50 - 247s - loss: 0.8544 - accuracy: 0.6326 - val_loss: 0.8439 - val_accuracy: 0.6667
Epoch 8/50 - 231s - loss: 0.7210 - accuracy: 0.6864 - val_loss: 0.7591 - val_accuracy: 0.6933
Epoch 9/50 - 244s - loss: 0.5938 - accuracy: 0.7498 - val_loss: 0.7377 - val_accuracy: 0.7022
Epoch 10/50 - 201s - loss: 0.4907 - accuracy: 0.7986 - val_loss: 0.6381 - val_accuracy: 0.7656
Epoch 11/50 - 241s - loss: 0.4244 - accuracy: 0.8412 - val_loss: 0.7605 - val_accuracy: 0.7389
Epoch 12/50 - 243s - loss: 0.3431 - accuracy: 0.8774 - val_loss: 0.5298 - val_accuracy: 0.8389
Epoch 13/50 - 216s - loss: 0.2638 - accuracy: 0.9102 - val_loss: 0.5396 - val_accuracy: 0.8500
Epoch 14/50 - 261s - loss: 0.2064 - accuracy: 0.9386 - val_loss: 0.5167 - val_accuracy: 0.8644
Epoch 15/50 - 249s - loss: 0.1807 - accuracy: 0.9500 - val_loss: 0.6138 - val_accuracy: 0.8367
Epoch 16/50 - 349s - loss: 0.1636 - accuracy: 0.9568 - val_loss: 0.5345 - val_accuracy: 0.8578
Epoch 17/50 - 318s - loss: 0.1423 - accuracy: 0.9625 - val_loss: 0.6736 - val_accuracy: 0.8611
score2 = model2.evaluate(test_pad, test_encod,
batch_size=MODEL2['batch-size'], verbose=2)
print('Test loss:', score2[0])
print('Test accuracy:', score2[1])
8/8 - 16s - loss: 0.6910 - accuracy: 0.8660 - 16s/epoch - 2s/step
Test loss: 0.6910145878791809
Test accuracy: 0.8659999966621399
acc = history2.history['accuracy']
val_acc = history2.history['val_accuracy']
loss = history2.history['loss']
val_loss = history2.history['val_loss']
epochs = range(len(acc))
plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
from mlxtend.plotting import plot_confusion_matrix
text_labels = encoder.classes_
y_pred = model2.predict(test_pad)
y_pred_clases = [text_labels[np.argmax(elemen)] for elemen in y_pred]
Y_test = text_labels[np.argmax(test_encod, axis=1)]
print("Classification Report: \n", classification_report(y_test.values, y_pred_clases))
Classification Report:
               precision    recall  f1-score   support

     hiburan        0.80      0.77      0.79       202
    olahraga        0.98      0.99      0.99       191
     showbiz        0.87      0.87      0.87       193
 tajuk utama        0.84      0.83      0.83       187
   teknologi        0.92      0.96      0.94       227

    accuracy                            0.88      1000
   macro avg        0.88      0.88      0.88      1000
weighted avg        0.88      0.88      0.88      1000
met = confusion_matrix(y_test.values, y_pred_clases)
plot_confusion_matrix(conf_mat=met, figsize=(6, 6), class_names=text_labels)
(<Figure size 432x432 with 1 Axes>, <matplotlib.axes._subplots.AxesSubplot at 0x214b350a7b8>)
MODEL4 = {'units':128, 'dropout':0.5, 'Lr':0.001, 'batch-size':128, 'epoch':50}
MODEL4
{'units': 128, 'dropout': 0.5, 'Lr': 0.001, 'batch-size': 128, 'epoch': 50}
MAX_NB_WORDS = len(tokenizer.word_index)+1
model4 = Sequential([
Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=train_pad.shape[1]),
SpatialDropout1D(MODEL4['dropout']),
LSTM(MODEL4['units'], dropout=MODEL4['dropout'], recurrent_dropout=MODEL4['dropout'], activation='tanh'),
Dropout(MODEL4['dropout']),
Dense(num_classes, activation='softmax')
])
model4.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding (Embedding) (None, 200, 200) 14488600 _________________________________________________________________ spatial_dropout1d (SpatialDr (None, 200, 200) 0 _________________________________________________________________ lstm (LSTM) (None, 128) 168448 _________________________________________________________________ dropout (Dropout) (None, 128) 0 _________________________________________________________________ dense (Dense) (None, 5) 645 ================================================================= Total params: 14,657,693 Trainable params: 14,657,693 Non-trainable params: 0 _________________________________________________________________
model4.compile(loss='categorical_crossentropy',
optimizer=Adam(learning_rate=MODEL4['Lr']),
metrics=['accuracy']
)
esCallback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history4 = model4.fit(train_pad, train_encod,
batch_size=MODEL4['batch-size'],
epochs=MODEL4['epoch'],
validation_split=0.1,
verbose=1,
callbacks=[esCallback]
)
Train on 8100 samples, validate on 900 samples
Epoch 1/50 - 221s - loss: 1.5289 - accuracy: 0.2894 - val_loss: 1.2912 - val_accuracy: 0.3622
Epoch 2/50 - 175s - loss: 1.3522 - accuracy: 0.3723 - val_loss: 1.2809 - val_accuracy: 0.3722
Epoch 3/50 - 186s - loss: 1.3059 - accuracy: 0.3868 - val_loss: 1.2999 - val_accuracy: 0.3933
Epoch 4/50 - 217s - loss: 1.3473 - accuracy: 0.3936 - val_loss: 1.2758 - val_accuracy: 0.4189
Epoch 5/50 - 230s - loss: 1.2571 - accuracy: 0.4374 - val_loss: 1.1798 - val_accuracy: 0.4544
Epoch 6/50 - 218s - loss: 1.1938 - accuracy: 0.4779 - val_loss: 1.0963 - val_accuracy: 0.4922
Epoch 7/50 - 202s - loss: 1.0084 - accuracy: 0.5653 - val_loss: 0.8115 - val_accuracy: 0.6322
Epoch 8/50 - 203s - loss: 0.9038 - accuracy: 0.6181 - val_loss: 0.7902 - val_accuracy: 0.6444
Epoch 9/50 - 207s - loss: 0.8581 - accuracy: 0.6364 - val_loss: 0.7853 - val_accuracy: 0.6511
Epoch 10/50 - 211s - loss: 0.8249 - accuracy: 0.6532 - val_loss: 0.7714 - val_accuracy: 0.6467
Epoch 11/50 - 196s - loss: 0.7869 - accuracy: 0.6623 - val_loss: 0.7675 - val_accuracy: 0.6511
Epoch 12/50 - 194s - loss: 0.7846 - accuracy: 0.6602 - val_loss: 0.7538 - val_accuracy: 0.6533
Epoch 13/50 - 194s - loss: 0.7492 - accuracy: 0.6748 - val_loss: 0.7285 - val_accuracy: 0.6667
Epoch 14/50 - 195s - loss: 0.7190 - accuracy: 0.6917 - val_loss: 0.7093 - val_accuracy: 0.6833
Epoch 15/50 - 194s - loss: 0.6722 - accuracy: 0.7312 - val_loss: 0.5539 - val_accuracy: 0.8056
Epoch 16/50 - 195s - loss: 0.6188 - accuracy: 0.7607 - val_loss: 0.5385 - val_accuracy: 0.8300
Epoch 17/50 - 198s - loss: 0.5956 - accuracy: 0.7780 - val_loss: 0.5009 - val_accuracy: 0.8300
Epoch 18/50 - 195s - loss: 0.5483 - accuracy: 0.8046 - val_loss: 0.4034 - val_accuracy: 0.8911
Epoch 19/50 - 199s - loss: 0.4991 - accuracy: 0.8460 - val_loss: 0.4170 - val_accuracy: 0.8944
Epoch 20/50 - 214s - loss: 0.4419 - accuracy: 0.8738 - val_loss: 0.3828 - val_accuracy: 0.9011
Epoch 21/50 - 212s - loss: 0.4289 - accuracy: 0.8784 - val_loss: 0.3702 - val_accuracy: 0.9044
Epoch 22/50 - 188s - loss: 0.3966 - accuracy: 0.8879 - val_loss: 0.3891 - val_accuracy: 0.9022
Epoch 23/50 - 191s - loss: 0.3494 - accuracy: 0.9056 - val_loss: 0.3387 - val_accuracy: 0.9100
Epoch 24/50 - 192s - loss: 0.3348 - accuracy: 0.9125 - val_loss: 0.3431 - val_accuracy: 0.9067
Epoch 25/50 - 192s - loss: 0.3136 - accuracy: 0.9173 - val_loss: 0.3425 - val_accuracy: 0.9133
Epoch 26/50 - 189s - loss: 0.2986 - accuracy: 0.9226 - val_loss: 0.3509 - val_accuracy: 0.9200
score4 = model4.evaluate(test_pad, test_encod,
batch_size=MODEL4['batch-size'], verbose=2)
print('Test loss:', score4[0])
print('Test accuracy:', score4[1])
1000/1 - 5s - loss: 0.4444 - accuracy: 0.9070
Test loss: 0.3792504758834839
Test accuracy: 0.907
acc = history4.history['accuracy']
val_acc = history4.history['val_accuracy']
loss = history4.history['loss']
val_loss = history4.history['val_loss']
epochs = range(len(acc))
plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
text_labels = encoder.classes_
y_pred = model4.predict(test_pad)
y_pred_clases = [text_labels[np.argmax(elemen)] for elemen in y_pred]
Y_test = text_labels[np.argmax(test_encod, axis=1)]
print("Classification Report: \n", classification_report(y_test.values, y_pred_clases))
Classification Report:
               precision    recall  f1-score   support

     hiburan        0.86      0.80      0.83       202
    olahraga        0.97      0.98      0.98       191
     showbiz        0.87      0.93      0.90       193
 tajuk utama        0.90      0.87      0.88       187
   teknologi        0.93      0.96      0.94       227

    accuracy                            0.91      1000
   macro avg        0.91      0.91      0.91      1000
weighted avg        0.91      0.91      0.91      1000
met = confusion_matrix(y_test.values, y_pred_clases)
plot_confusion_matrix(conf_mat=met, figsize=(6, 6), class_names=text_labels)
(<Figure size 432x432 with 1 Axes>, <matplotlib.axes._subplots.AxesSubplot at 0x1bb0241a080>)
lstm = load_model(r'D:\Basic Natural Languange Processing\Model\LSTM_model91.h5')
# Run a prediction against the existing dataset
text_labels = encoder.classes_
for i in range(1):
    prediction = lstm.predict(np.array([test_pad[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    print(X_test.iloc[i][:50], "...")
    print('Actual label: ' + y_test.iloc[i])
    print('Predicted label: ' + predicted_label)
    print([round(p * 100, 2) for p in prediction[0]], "\n")
jakarta news presiden joko widodo menerima kunjung ...
Actual label: tajuk utama
Predicted label: tajuk utama
[1.41, 1.1, 0.61, 96.3, 0.57]
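The same pipeline generalizes to arbitrary raw text: clean, remove stopwords, tokenize with the already-fitted tokenizer, pad, and predict. A sketch (the helper name and example sentence are hypothetical):

def predict_category(raw_text):
    # Reuse the training preprocessing so the input distribution matches.
    cleaned = stopwordsRemoval(filtered(raw_text))
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    probs = lstm.predict(padded)[0]
    return text_labels[np.argmax(probs)]

print(predict_category("presiden joko widodo meresmikan jalan tol baru di jakarta"))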
MODEL2 = {'units':128, 'dropout':0.5, 'Lr':0.01, 'batch-size':128, 'epoch':50}
MAX_NB_WORDS = len(tokenizer.word_index)+1
model2 = Sequential([
Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=train_pad.shape[1]),
SpatialDropout1D(MODEL2['dropout']),
LSTM(MODEL2['units'], dropout=MODEL2['dropout'], recurrent_dropout=MODEL2['dropout'], activation='tanh'),
Dropout(MODEL2['dropout']),
Dense(num_classes, activation='softmax')
])
model2.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding (Embedding) (None, 200, 200) 11280200 _________________________________________________________________ spatial_dropout1d (SpatialDr (None, 200, 200) 0 _________________________________________________________________ lstm (LSTM) (None, 128) 168448 _________________________________________________________________ dropout (Dropout) (None, 128) 0 _________________________________________________________________ dense (Dense) (None, 5) 645 ================================================================= Total params: 11,449,293 Trainable params: 11,449,293 Non-trainable params: 0 _________________________________________________________________
model2.compile(loss='categorical_crossentropy',
optimizer=Adam(learning_rate=MODEL2['Lr']),
metrics=['accuracy']
)
esCallback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history2 = model2.fit(train_pad, train_encod,
batch_size=MODEL2['batch-size'],
epochs=MODEL2['epoch'],
validation_split=0.1,
verbose=1,
callbacks=[esCallback]
)
Train on 8100 samples, validate on 900 samples
Epoch 1/50 - 220s - loss: 1.4692 - accuracy: 0.3202 - val_loss: 1.2958 - val_accuracy: 0.4067
Epoch 2/50 - 219s - loss: 1.2770 - accuracy: 0.4253 - val_loss: 1.2130 - val_accuracy: 0.4400
Epoch 3/50 - 238s - loss: 1.1596 - accuracy: 0.4747 - val_loss: 1.0015 - val_accuracy: 0.6000
Epoch 4/50 - 195s - loss: 0.8796 - accuracy: 0.6349 - val_loss: 0.6694 - val_accuracy: 0.7767
Epoch 5/50 - 198s - loss: 0.6520 - accuracy: 0.7604 - val_loss: 0.4319 - val_accuracy: 0.8767
Epoch 6/50 - 200s - loss: 0.5002 - accuracy: 0.8336 - val_loss: 0.3631 - val_accuracy: 0.9000
Epoch 7/50 - 198s - loss: 0.4120 - accuracy: 0.8731 - val_loss: 0.3491 - val_accuracy: 0.9044
Epoch 8/50 - 203s - loss: 0.3504 - accuracy: 0.8928 - val_loss: 0.3365 - val_accuracy: 0.9022
Epoch 9/50 - 198s - loss: 0.3020 - accuracy: 0.9088 - val_loss: 0.3453 - val_accuracy: 0.9044
Epoch 10/50 - 199s - loss: 0.2675 - accuracy: 0.9212 - val_loss: 0.3779 - val_accuracy: 0.9078
Epoch 11/50 - 201s - loss: 0.2312 - accuracy: 0.9323 - val_loss: 0.3903 - val_accuracy: 0.9144
score2 = model2.evaluate(test_pad, test_encod,
batch_size=MODEL2['batch-size'], verbose=2)
print('Test loss:', score2[0])
print('Test accuracy:', score2[1])
1000/1 - 5s - loss: 0.5143 - accuracy: 0.8930
Test loss: 0.47312470149993896
Test accuracy: 0.893
acc = history2.history['accuracy']
val_acc = history2.history['val_accuracy']
loss = history2.history['loss']
val_loss = history2.history['val_loss']
epochs = range(len(acc))
plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
text_labels = encoder.classes_
y_pred = model2.predict(test_pad)
y_pred_clases = [text_labels[np.argmax(elemen)] for elemen in y_pred]
Y_test = text_labels[np.argmax(test_encod, axis=1)]
print("Classification Report: \n", classification_report(y_test.values, y_pred_clases))
Classification Report:
               precision    recall  f1-score   support

     hiburan        0.85      0.77      0.81       202
    olahraga        0.97      0.99      0.98       191
     showbiz        0.84      0.90      0.87       193
 tajuk utama        0.84      0.87      0.86       187
   teknologi        0.95      0.93      0.94       227

    accuracy                            0.89      1000
   macro avg        0.89      0.89      0.89      1000
weighted avg        0.89      0.89      0.89      1000
met = confusion_matrix(y_test.values, y_pred_clases)
plot_confusion_matrix(conf_mat=met, figsize=(6, 6), class_names=text_labels)
(<Figure size 432x432 with 1 Axes>, <matplotlib.axes._subplots.AxesSubplot at 0x24a8f0e6710>)
MODEL4 = {'units':128, 'dropout':0.5, 'Lr':0.001, 'batch-size':128, 'epoch':50}
MODEL4
{'units': 128, 'dropout': 0.5, 'Lr': 0.001, 'batch-size': 128, 'epoch': 50}
MAX_NB_WORDS = len(tokenizer.word_index)+1
model4 = Sequential([
Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=train_pad.shape[1]),
SpatialDropout1D(MODEL4['dropout']),
LSTM(MODEL4['units'], dropout=MODEL4['dropout'], recurrent_dropout=MODEL4['dropout'], activation='tanh'),
Dropout(MODEL4['dropout']),
Dense(num_classes, activation='softmax')
])
model4.summary()
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_1 (Embedding) (None, 200, 200) 11280200 _________________________________________________________________ spatial_dropout1d_1 (Spatial (None, 200, 200) 0 _________________________________________________________________ lstm_1 (LSTM) (None, 128) 168448 _________________________________________________________________ dropout_1 (Dropout) (None, 128) 0 _________________________________________________________________ dense_1 (Dense) (None, 5) 645 ================================================================= Total params: 11,449,293 Trainable params: 11,449,293 Non-trainable params: 0 _________________________________________________________________
model4.compile(loss='categorical_crossentropy',
optimizer=Adam(learning_rate=MODEL4['Lr']),
metrics=['accuracy']
)
esCallback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history4 = model4.fit(train_pad, train_encod,
batch_size=MODEL4['batch-size'],
epochs=MODEL4['epoch'],
validation_split=0.1,
verbose=1,
callbacks=[esCallback]
)
Train on 8100 samples, validate on 900 samples
Epoch 1/50 - 195s - loss: 1.5387 - accuracy: 0.2796 - val_loss: 1.3291 - val_accuracy: 0.3444
Epoch 2/50 - 217s - loss: 1.3621 - accuracy: 0.3685 - val_loss: 1.2854 - val_accuracy: 0.4144
Epoch 3/50 - 203s - loss: 1.3113 - accuracy: 0.3993 - val_loss: 1.2836 - val_accuracy: 0.4033
Epoch 4/50 - 240s - loss: 1.3006 - accuracy: 0.4221 - val_loss: 1.2304 - val_accuracy: 0.4222
Epoch 5/50 - 201s - loss: 1.2420 - accuracy: 0.4559 - val_loss: 1.1208 - val_accuracy: 0.4856
Epoch 6/50 - 200s - loss: 1.1096 - accuracy: 0.5389 - val_loss: 0.9832 - val_accuracy: 0.5956
Epoch 7/50 - 248s - loss: 1.0261 - accuracy: 0.5790 - val_loss: 0.9041 - val_accuracy: 0.6189
Epoch 8/50 - 247s - loss: 0.9798 - accuracy: 0.5984 - val_loss: 0.8874 - val_accuracy: 0.6267
Epoch 9/50 - 202s - loss: 0.9108 - accuracy: 0.6328 - val_loss: 0.8676 - val_accuracy: 0.6222
Epoch 10/50 - 206s - loss: 0.8689 - accuracy: 0.6407 - val_loss: 0.8545 - val_accuracy: 0.6278
Epoch 11/50 - 204s - loss: 0.8775 - accuracy: 0.6372 - val_loss: 0.7987 - val_accuracy: 0.6744
Epoch 12/50 - 203s - loss: 0.8300 - accuracy: 0.6662 - val_loss: 0.6321 - val_accuracy: 0.8033
Epoch 13/50 - 215s - loss: 0.7417 - accuracy: 0.7286 - val_loss: 0.5899 - val_accuracy: 0.7967
Epoch 14/50 - 209s - loss: 0.7310 - accuracy: 0.7354 - val_loss: 0.6281 - val_accuracy: 0.7889
Epoch 15/50 - 213s - loss: 0.7454 - accuracy: 0.7300 - val_loss: 0.5219 - val_accuracy: 0.8644
Epoch 16/50 - 206s - loss: 0.7185 - accuracy: 0.7602 - val_loss: 0.5529 - val_accuracy: 0.8389
Epoch 17/50 - 201s - loss: 0.7218 - accuracy: 0.7577 - val_loss: 0.5118 - val_accuracy: 0.8656
Epoch 18/50 - 200s - loss: 0.6211 - accuracy: 0.7974 - val_loss: 0.4528 - val_accuracy: 0.8711
Epoch 19/50 - 201s - loss: 0.5522 - accuracy: 0.8347 - val_loss: 0.4164 - val_accuracy: 0.8889
Epoch 20/50 - 199s - loss: 0.5333 - accuracy: 0.8383 - val_loss: 0.3924 - val_accuracy: 0.8978
Epoch 21/50 - 202s - loss: 0.5096 - accuracy: 0.8475 - val_loss: 0.3777 - val_accuracy: 0.9033
Epoch 22/50 - 208s - loss: 0.4911 - accuracy: 0.8531 - val_loss: 0.3857 - val_accuracy: 0.8944
Epoch 23/50 - 200s - loss: 0.4823 - accuracy: 0.8505 - val_loss: 0.3595 - val_accuracy: 0.9044
Epoch 24/50 - 202s - loss: 0.4260 - accuracy: 0.8746 - val_loss: 0.3565 - val_accuracy: 0.9067
Epoch 25/50 - 200s - loss: 0.3988 - accuracy: 0.8860 - val_loss: 0.3428 - val_accuracy: 0.9089
Epoch 26/50 - 203s - loss: 0.3843 - accuracy: 0.8860 - val_loss: 0.3399 - val_accuracy: 0.9111
Epoch 27/50 - 199s - loss: 0.3685 - accuracy: 0.8974 - val_loss: 0.3354 - val_accuracy: 0.9111
Epoch 28/50 - 199s - loss: 0.3559 - accuracy: 0.8995 - val_loss: 0.3250 - val_accuracy: 0.9122
Epoch 29/50 - 200s - loss: 0.3438 - accuracy: 0.9009 - val_loss: 0.3338 - val_accuracy: 0.9056
Epoch 30/50 - 205s - loss: 0.3379 - accuracy: 0.9037 - val_loss: 0.3234 - val_accuracy: 0.9000
Epoch 31/50 - 199s - loss: 0.3266 - accuracy: 0.9079 - val_loss: 0.3205 - val_accuracy: 0.9111
Epoch 32/50 - 203s - loss: 0.3051 - accuracy: 0.9180 - val_loss: 0.3269 - val_accuracy: 0.9133
Epoch 33/50 - 201s - loss: 0.2899 - accuracy: 0.9194 - val_loss: 0.3338 - val_accuracy: 0.9111
Epoch 34/50 - 198s - loss: 0.2843 - accuracy: 0.9219 - val_loss: 0.3160 - val_accuracy: 0.9156
Epoch 35/50 - 209s - loss: 0.2524 - accuracy: 0.9311 - val_loss: 0.3330 - val_accuracy: 0.9156
Epoch 36/50 - 199s - loss: 0.2350 - accuracy: 0.9411 - val_loss: 0.3210 - val_accuracy: 0.9200
Epoch 37/50 - 199s - loss: 0.2930 - accuracy: 0.9257 - val_loss: 0.3604 - val_accuracy: 0.9067
score4 = model4.evaluate(test_pad, test_encod,
batch_size=MODEL4['batch-size'], verbose=2)
print('Test loss:', score4[0])
print('Test accuracy:', score4[1])
1000/1 - 5s - loss: 0.4530 - accuracy: 0.8910
Test loss: 0.42583607721328737
Test accuracy: 0.891
acc = history4.history['accuracy']
val_acc = history4.history['val_accuracy']
loss = history4.history['loss']
val_loss = history4.history['val_loss']
epochs = range(len(acc))
plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
text_labels = encoder.classes_
y_pred = model4.predict(test_pad)
y_pred_clases = [text_labels[np.argmax(elemen)] for elemen in y_pred]
Y_test = text_labels[np.argmax(test_encod, axis=1)]
print("Classification Report: \n", classification_report(y_test.values, y_pred_clases))
Classification Report:
               precision    recall  f1-score   support

     hiburan        0.79      0.83      0.81       202
    olahraga        0.97      0.98      0.98       191
     showbiz        0.88      0.90      0.89       193
 tajuk utama        0.83      0.88      0.85       187
   teknologi        0.98      0.87      0.92       227

    accuracy                            0.89      1000
   macro avg        0.89      0.89      0.89      1000
weighted avg        0.90      0.89      0.89      1000
met = confusion_matrix(y_test.values, y_pred_clases)
plot_confusion_matrix(conf_mat=met, figsize=(6, 6), class_names=text_labels)
(<Figure size 432x432 with 1 Axes>, <matplotlib.axes._subplots.AxesSubplot at 0x24a9cbad908>)
# Dataset split 70:30
MODEL7 = {'units':128, 'dropout':0.5, 'Lr':0.001, 'batch-size':128, 'epoch':50}
MODEL7
{'units': 128, 'dropout': 0.5, 'Lr': 0.001, 'batch-size': 128, 'epoch': 50}
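The 6,300/700 train/validation counts in the log below imply that the split, tokenizer fitting, padding, and label encoding above were re-run with test_size=0.3 before this cell (and with test_size=0.2 for the 80:20 section); those intermediate cells are not shown. A sketch of the assumed re-split:

# Assumed re-split for the 70:30 experiment (not shown in the original cells):
X_train, X_test, y_train, y_test = train_test_split(df['stopword_removal'], df['category'],
                                                    test_size=0.3, random_state=25)
# ...then re-fit the Tokenizer on the new X_train and re-run padding/encoding as above.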
MAX_NB_WORDS = len(tokenizer.word_index)+1
model7 = Sequential([
Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=train_pad.shape[1]),
SpatialDropout1D(MODEL7['dropout']),
LSTM(MODEL7['units'], dropout=MODEL7['dropout'], recurrent_dropout=MODEL7['dropout'], activation='tanh'),
Dropout(MODEL7['dropout']),
Dense(num_classes, activation='softmax')
])
model7.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding (Embedding) (None, 200, 200) 12833800 _________________________________________________________________ spatial_dropout1d (SpatialDr (None, 200, 200) 0 _________________________________________________________________ lstm (LSTM) (None, 128) 168448 _________________________________________________________________ dropout (Dropout) (None, 128) 0 _________________________________________________________________ dense (Dense) (None, 5) 645 ================================================================= Total params: 13,002,893 Trainable params: 13,002,893 Non-trainable params: 0 _________________________________________________________________
model7.compile(loss='categorical_crossentropy',
optimizer=Adam(learning_rate=MODEL7['Lr']),
metrics=['accuracy']
)
esCallback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history7 = model7.fit(train_pad, train_encod,
batch_size=MODEL7['batch-size'],
epochs=MODEL7['epoch'],
validation_split=0.1,
verbose=1,
callbacks=[esCallback]
)
Train on 6300 samples, validate on 700 samples
Epoch 1/50 - 141s - loss: 1.5600 - accuracy: 0.2692 - val_loss: 1.3414 - val_accuracy: 0.3543
Epoch 2/50 - 146s - loss: 1.4482 - accuracy: 0.3479 - val_loss: 1.3195 - val_accuracy: 0.3700
Epoch 3/50 - 156s - loss: 1.3575 - accuracy: 0.3778 - val_loss: 1.2755 - val_accuracy: 0.3743
Epoch 4/50 - 156s - loss: 1.3123 - accuracy: 0.3994 - val_loss: 1.1706 - val_accuracy: 0.5386
Epoch 5/50 - 159s - loss: 1.2719 - accuracy: 0.4498 - val_loss: 1.1580 - val_accuracy: 0.5271
Epoch 6/50 - 159s - loss: 1.2186 - accuracy: 0.4813 - val_loss: 1.0657 - val_accuracy: 0.5557
Epoch 7/50 - 175s - loss: 1.1794 - accuracy: 0.5076 - val_loss: 1.0112 - val_accuracy: 0.6071
Epoch 8/50 - 165s - loss: 1.1136 - accuracy: 0.5446 - val_loss: 0.9531 - val_accuracy: 0.6014
Epoch 9/50 - 171s - loss: 1.0233 - accuracy: 0.5803 - val_loss: 0.9087 - val_accuracy: 0.6129
Epoch 10/50 - 180s - loss: 1.0037 - accuracy: 0.5948 - val_loss: 0.9252 - val_accuracy: 0.6186
Epoch 11/50 - 164s - loss: 0.9599 - accuracy: 0.6087 - val_loss: 0.8704 - val_accuracy: 0.6257
Epoch 12/50 - 172s - loss: 0.9051 - accuracy: 0.6221 - val_loss: 0.8552 - val_accuracy: 0.6271
Epoch 13/50 - 165s - loss: 0.9043 - accuracy: 0.6229 - val_loss: 0.8252 - val_accuracy: 0.6371
Epoch 14/50 - 214s - loss: 0.8730 - accuracy: 0.6408 - val_loss: 0.8445 - val_accuracy: 0.6329
Epoch 15/50 - 158s - loss: 0.8921 - accuracy: 0.6460 - val_loss: 0.9585 - val_accuracy: 0.6357
Epoch 16/50 - 151s - loss: 0.8818 - accuracy: 0.6463 - val_loss: 0.8213 - val_accuracy: 0.6514
Epoch 17/50 - 160s - loss: 0.8303 - accuracy: 0.6633 - val_loss: 0.8147 - val_accuracy: 0.6414
Epoch 18/50 - 168s - loss: 0.8139 - accuracy: 0.6552 - val_loss: 0.8035 - val_accuracy: 0.6471
Epoch 19/50 - 160s - loss: 0.7886 - accuracy: 0.6694 - val_loss: 0.8083 - val_accuracy: 0.6514
Epoch 20/50 - 160s - loss: 0.7839 - accuracy: 0.6870 - val_loss: 0.6593 - val_accuracy: 0.7800
Epoch 21/50 - 153s - loss: 0.7853 - accuracy: 0.6910 - val_loss: 0.7030 - val_accuracy: 0.7314
Epoch 22/50 - 162s - loss: 0.7441 - accuracy: 0.7083 - val_loss: 0.6170 - val_accuracy: 0.7971
Epoch 23/50 - 149s - loss: 0.7338 - accuracy: 0.7297 - val_loss: 0.6490 - val_accuracy: 0.7571
Epoch 24/50 - 149s - loss: 0.6995 - accuracy: 0.7492 - val_loss: 0.6165 - val_accuracy: 0.8071
Epoch 25/50 - 153s - loss: 0.6659 - accuracy: 0.7630 - val_loss: 0.5904 - val_accuracy: 0.8214
Epoch 26/50 - 151s - loss: 0.6304 - accuracy: 0.7790 - val_loss: 0.7150 - val_accuracy: 0.7571
Epoch 27/50 - 148s - loss: 0.7118 - accuracy: 0.7470 - val_loss: 0.6387 - val_accuracy: 0.7500
Epoch 28/50 - 147s - loss: 0.7015 - accuracy: 0.7494 - val_loss: 0.5983 - val_accuracy: 0.8214
score7 = model7.evaluate(test_pad, test_encod,
batch_size=MODEL7['batch-size'], verbose=2)
print('Test loss:', score7[0])
print('Test accuracy:', score7[1])
3000/1 - 15s - loss: 0.6587 - accuracy: 0.7910
Test loss: 0.6494710607528686
Test accuracy: 0.791
acc = history7.history['accuracy']
val_acc = history7.history['val_accuracy']
loss = history7.history['loss']
val_loss = history7.history['val_loss']
epochs = range(len(acc))
plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
text_labels = encoder.classes_
y_pred = model7.predict(test_pad)
y_pred_clases = [text_labels[np.argmax(elemen)] for elemen in y_pred]
Y_test = text_labels[np.argmax(test_encod, axis=1)]
print("Classification Report: \n", classification_report(y_test.values, y_pred_clases))
Classification Report:
               precision    recall  f1-score   support

     hiburan        0.87      0.61      0.72       613
    olahraga        0.97      0.92      0.95       603
     showbiz        0.75      0.61      0.67       590
 tajuk utama        0.56      0.93      0.70       564
   teknologi        0.95      0.88      0.92       630

    accuracy                            0.79      3000
   macro avg        0.82      0.79      0.79      3000
weighted avg        0.83      0.79      0.79      3000
met = confusion_matrix(y_test.values, y_pred_clases)
plot_confusion_matrix(conf_mat=met, figsize=(6, 6), class_names=text_labels)
(<Figure size 432x432 with 1 Axes>, <matplotlib.axes._subplots.AxesSubplot at 0x2aead6e86a0>)
# Dataset split 80:20
MODEL8 = {'units':128, 'dropout':0.5, 'Lr':0.001, 'batch-size':128, 'epoch':50}
MODEL8
{'units': 128, 'dropout': 0.5, 'Lr': 0.001, 'batch-size': 128, 'epoch': 50}
MAX_NB_WORDS = len(tokenizer.word_index)+1
model8 = Sequential([
Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=train_pad.shape[1]),
SpatialDropout1D(MODEL8['dropout']),
LSTM(MODEL8['units'], dropout=MODEL8['dropout'], recurrent_dropout=MODEL8['dropout'], activation='tanh'),
Dropout(MODEL8['dropout']),
Dense(num_classes, activation='softmax')
])
model8.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding (Embedding) (None, 200, 200) 13675800 _________________________________________________________________ spatial_dropout1d (SpatialDr (None, 200, 200) 0 _________________________________________________________________ lstm (LSTM) (None, 128) 168448 _________________________________________________________________ dropout (Dropout) (None, 128) 0 _________________________________________________________________ dense (Dense) (None, 5) 645 ================================================================= Total params: 13,844,893 Trainable params: 13,844,893 Non-trainable params: 0 _________________________________________________________________
model8.compile(loss='categorical_crossentropy',
optimizer=Adam(learning_rate=MODEL8['Lr']),
metrics=['accuracy']
)
esCallback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history8 = model8.fit(train_pad, train_encod,
batch_size=MODEL8['batch-size'],
epochs=MODEL8['epoch'],
validation_split=0.1,
verbose=1,
callbacks=[esCallback]
)
Train on 7200 samples, validate on 800 samples
Epoch 1/50 - 150s - loss: 1.5368 - accuracy: 0.2719 - val_loss: 1.3265 - val_accuracy: 0.3675
Epoch 2/50 - 152s - loss: 1.3534 - accuracy: 0.3676 - val_loss: 1.3147 - val_accuracy: 0.3837
Epoch 3/50 - 156s - loss: 1.3300 - accuracy: 0.3789 - val_loss: 1.2552 - val_accuracy: 0.4238
Epoch 4/50 - 154s - loss: 1.2847 - accuracy: 0.4203 - val_loss: 1.2063 - val_accuracy: 0.4363
Epoch 5/50 - 160s - loss: 1.2374 - accuracy: 0.4556 - val_loss: 1.1674 - val_accuracy: 0.4600
Epoch 6/50 - 162s - loss: 1.1778 - accuracy: 0.4926 - val_loss: 1.1220 - val_accuracy: 0.5088
Epoch 7/50 - 163s - loss: 1.1413 - accuracy: 0.5167 - val_loss: 0.9792 - val_accuracy: 0.5950
Epoch 8/50 - 166s - loss: 1.0838 - accuracy: 0.5681 - val_loss: 0.8655 - val_accuracy: 0.6350
Epoch 9/50 - 165s - loss: 0.9854 - accuracy: 0.5890 - val_loss: 0.8593 - val_accuracy: 0.6200
Epoch 10/50 - 164s - loss: 0.8914 - accuracy: 0.6294 - val_loss: 0.8244 - val_accuracy: 0.6513
Epoch 11/50 - 172s - loss: 0.9085 - accuracy: 0.6199 - val_loss: 0.8427 - val_accuracy: 0.6363
Epoch 12/50 - 168s - loss: 0.8556 - accuracy: 0.6406 - val_loss: 0.8318 - val_accuracy: 0.6363
Epoch 13/50 - 164s - loss: 0.8174 - accuracy: 0.6447 - val_loss: 0.8116 - val_accuracy: 0.6562
Epoch 14/50 - 168s - loss: 0.7722 - accuracy: 0.6658 - val_loss: 0.7700 - val_accuracy: 0.6675
Epoch 15/50 - 164s - loss: 0.7747 - accuracy: 0.6686 - val_loss: 0.7495 - val_accuracy: 0.6750
Epoch 16/50 - 162s - loss: 0.7518 - accuracy: 0.6726 - val_loss: 0.7579 - val_accuracy: 0.6637
Epoch 17/50 - 167s - loss: 0.7360 - accuracy: 0.6746 - val_loss: 0.7768 - val_accuracy: 0.6637
Epoch 18/50 - 168s - loss: 0.7156 - accuracy: 0.6953 - val_loss: 0.6634 - val_accuracy: 0.7225
Epoch 19/50 - 166s - loss: 0.6522 - accuracy: 0.7357 - val_loss: 0.5419 - val_accuracy: 0.8250
Epoch 20/50 - 175s - loss: 0.6047 - accuracy: 0.7771 - val_loss: 0.5249 - val_accuracy: 0.8100
Epoch 21/50 - 170s - loss: 0.5678 - accuracy: 0.8043 - val_loss: 0.4025 - val_accuracy: 0.9150
Epoch 22/50 - 161s - loss: 0.5569 - accuracy: 0.8226 - val_loss: 0.4411 - val_accuracy: 0.8950
Epoch 23/50 - 163s - loss: 0.5474 - accuracy: 0.8235 - val_loss: 0.4338 - val_accuracy: 0.9013
Epoch 24/50 - 163s - loss: 0.5426 - accuracy: 0.8343 - val_loss: 0.4076 - val_accuracy: 0.9075
score8 = model8.evaluate(test_pad, test_encod,
batch_size=MODEL8['batch-size'], verbose=2)
print('Test loss:', score8[0])
print('Test accuracy:', score8[1])
2000/1 - 12s - loss: 0.4838 - accuracy: 0.8830
Test loss: 0.49501818656921387
Test accuracy: 0.883
acc = history8.history['accuracy']
val_acc = history8.history['val_accuracy']
loss = history8.history['loss']
val_loss = history8.history['val_loss']
epochs = range(len(acc))
plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
text_labels = encoder.classes_
y_pred = model8.predict(test_pad)
y_pred_clases = [text_labels[np.argmax(elemen)] for elemen in y_pred]
Y_test = text_labels[np.argmax(test_encod, axis=1)]
print("Classification Report: \n", classification_report(y_test.values, y_pred_clases))
Classification Report:
               precision    recall  f1-score   support

     hiburan        0.83      0.76      0.79       408
    olahraga        0.96      0.95      0.96       397
     showbiz        0.80      0.92      0.86       382
 tajuk utama        0.85      0.87      0.86       382
   teknologi        0.97      0.91      0.94       431

    accuracy                            0.88      2000
   macro avg        0.88      0.88      0.88      2000
weighted avg        0.89      0.88      0.88      2000
met = confusion_matrix(y_test.values, y_pred_clases)
plot_confusion_matrix(conf_mat=met, figsize=(6, 6), class_names=text_labels)
(<Figure size 432x432 with 1 Axes>, <matplotlib.axes._subplots.AxesSubplot at 0x1e834334ac8>)