Commit 913ee5d: first commit
Samitha Nanayakkara authored and committed Jul 29, 2018
Showing 13 changed files with 642 additions and 0 deletions.
225 changes: 225 additions & 0 deletions Code/Code.py
@@ -0,0 +1,225 @@
import os
import random
import re

import numpy as np
from keras.callbacks import EarlyStopping, TensorBoard
from keras.layers import Dense, Dropout
from keras.models import Sequential, load_model
from keras.optimizers import Adam
from scipy import sparse
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif


# Global list of categories.
categories = ['Local','International'] # All interested categories should be listed here.

# Pre-process data. Here only have removed links from sinhala news data.
def pre_process(folderName,functionType):
path_to_Folder_Parent_Folder = os.path.abspath(__file__ + "/../../")

if(os.path.isfile(path_to_Folder_Parent_Folder+'/'+functionType+'/'+folderName+'/text.txt') and not (os.path.isfile(path_to_Folder_Parent_Folder+'/'+folderName+'/output.txt'))):
inFile = open(path_to_Folder_Parent_Folder+'/'+functionType+'/'+folderName+'/text.txt','r')
outFile = open(path_to_Folder_Parent_Folder+'/'+functionType+'/'+folderName+'/output.txt','w+')
text = inFile.read().splitlines()
for line in text:
outFile.write(re.sub(r'https?:\/\/.*[\r\n]*','',line)+'\n') # Use to remove links from original news articles
outFile.close()
inFile.close()
else:
print('File not found or output file exist')
pass

# Create a corpus of news lines (and matching labels) from the pre-processed files.
# Works the same way for 'Training', 'Testing' and 'Validation'.
def createCorpus(functionType):
    parent_folder = os.path.abspath(__file__ + "/../../")
    fullCorpus = []
    labels = []
    for types in categories:
        out_path = parent_folder + '/' + functionType + '/' + types + '/output.txt'
        if not os.path.isfile(out_path):
            pre_process(types, functionType)  # Create the pre-processed output file if it does not exist.

        inFile = open(out_path)
        text = inFile.read().splitlines()
        inFile.close()
        # Label every line of this category: 1 = Local, 0 = International.
        for _ in text:
            labels.append(1 if types == 'Local' else 0)
        fullCorpus = fullCorpus + text

    # Shuffle the corpus and the labels together so their pairing is preserved.
    combined = list(zip(fullCorpus, labels))
    random.shuffle(combined)
    fullCorpus[:], labels[:] = zip(*combined)

    # print(len(labels))  # Size of the corpus across all categories.
    return fullCorpus, labels

# Build bigram features (since n=2 here) and transform the data to a vector
# representation with TF-IDF. SelectKBest removes all but the k highest-scoring features.
def ngramFunc(training_vals, training_labels, test_values, val_values):
    ngram = (1, 2)   # n-gram range: unigrams and bigrams.
    topK = 20000     # Maximum number of features to keep.
    token_mode = 'word'
    min_freq = 2
    kwargs = {
        'ngram_range': ngram,       # Use 1-grams + 2-grams.
        'dtype': 'int32',
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': token_mode,     # Split text into word tokens.
        'min_df': min_freq,
    }
    vectorizer = TfidfVectorizer(**kwargs)
    training = vectorizer.fit_transform(training_vals)
    testing = vectorizer.transform(test_values)
    validating = vectorizer.transform(val_values)

    # Identify the best features (ANOVA F-value) and apply the same selection
    # to the training, testing and validation sets.
    selector = SelectKBest(f_classif, k=min(topK, training.shape[1]))
    selector.fit(training, training_labels)
    train = selector.transform(training).astype('float32')
    test = selector.transform(testing).astype('float32')
    validation = selector.transform(validating).astype('float32')

    return train, test, validation

# Vectorize prediction text with a freshly fitted TF-IDF vectorizer, then tile
# and pad the feature matrix so its width matches the feature size the saved
# model was trained with (the training vectorizer is not persisted, so this is
# a workaround rather than a proper transform).
def ngramFuncPrediction(predict_values):
    ngram = (1, 2)
    trainingFeatureSize = 1196  # Feature count the saved model expects.
    token_mode = 'word'
    min_freq = 1
    kwargs = {
        'ngram_range': ngram,       # Use 1-grams + 2-grams.
        'dtype': 'int32',
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': token_mode,     # Split text into word tokens.
        'min_df': min_freq,
        'max_features': trainingFeatureSize,
    }
    vectorizer = TfidfVectorizer(**kwargs)
    predict = vectorizer.fit_transform(predict_values).astype('float32')
    firstSize = predict.shape[1]
    print(firstSize)

    # Tile the matrix horizontally until it is at least trainingFeatureSize wide.
    rep = trainingFeatureSize / firstSize - 1
    temp = predict
    i = 0
    while i < rep:
        predict = hstack([predict, temp])
        i += 1

    # Trim to exactly trainingFeatureSize columns, then overwrite the tiled
    # padding columns with a negligibly small value.
    predict = sparse.lil_matrix(predict.tocsr()[:, 0:trainingFeatureSize])
    predict[:, firstSize:predict.shape[1]] = pow(10, -100)

    print(predict.shape[1])
    return predict

# Final method that runs the whole training pipeline, using all other functions.
def training():

    training_fullCorpus, training_labels = createCorpus('Training')
    test_fullCorpus, test_labels = createCorpus('Testing')
    val_fullCorpus, val_labels = createCorpus('Validation')

    # Bag-of-words approach.
    train, test, validate = ngramFunc(training_fullCorpus, training_labels, test_fullCorpus, val_fullCorpus)

    inputShape = train.shape[1:]
    print('------------- Input Shape ----------------')
    print(inputShape)
    adam = Adam(lr=0.0005)  # Learning rate for this training run.
    batchSize = 512         # Batch size.
    # TensorBoard graph. Run `tensorboard --logdir=./logs` from this file's
    # directory in a terminal (macOS/Linux) or CMD (Windows).
    tensorboard = TensorBoard(log_dir="./logs", histogram_freq=0, write_graph=True, write_images=True)
    earlystop = EarlyStopping(monitor='val_loss', patience=5)  # Stop early before the model overfits.

    # With two classes this is a binary classification, so the final layer is a
    # single sigmoid unit trained with binary_crossentropy. (For more than two
    # classes, the final layer would need len(categories) softmax units,
    # categorical_crossentropy, and one-hot labels.)
    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=inputShape))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.summary()

    if len(categories) > 2:
        model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    else:
        model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

    model.fit(train, np.asarray(training_labels), batch_size=batchSize, epochs=5000,
              callbacks=[tensorboard, earlystop],
              validation_data=(validate, np.asarray(val_labels)))
    score, acc = model.evaluate(test, np.asarray(test_labels), batch_size=batchSize)
    print('Score: ', score)   # Test loss.
    print('Accuracy: ', acc)  # Test accuracy (about 70.7% with this default configuration).
    print('---------- Training Successful ----------')
    model.save('Sinhala Text Classifier.h5')  # Save the model in HDF5 format.

def predict(txtfileName):
    currentLocation = os.path.abspath(__file__ + "/../")
    if not os.path.isfile(currentLocation + '/Sinhala Text Classifier.h5'):
        print('Model is missing under the name "Sinhala Text Classifier.h5", so training starts before predicting')
        training()

    print('Loading model. Please wait ... ')
    model = load_model('Sinhala Text Classifier.h5')
    filePath = os.path.abspath(__file__ + "/../../") + '/Predict/' + txtfileName + '.txt'
    if os.path.isfile(filePath):
        inFile = open(filePath, 'r')
        text = inFile.read().splitlines()
        inFile.close()
        features = ngramFuncPrediction(text)
        print('Prediction starts')
        out = model.predict(features)
        out = sum(out) / len(out)           # Average the per-line predictions.
        finalOut = (out > 0.5).astype(int)  # Threshold at 0.5.
        print(out)
        if finalOut[0] == 0:
            print('International')
        else:
            print('Local')
    else:
        print('Prediction input file not found')


if __name__ == '__main__':
    predict('text')
Binary file added Code/Sinhala Text Classifier.h5
Binary file not shown.
2 changes: 2 additions & 0 deletions Predict/text.txt
@@ -0,0 +1,2 @@
pākistānayē mahā mætivaraṇaya pævætvū badādā (25) miliyana gaṇanak siya chandaya prakāśa kirīmaṭa chanda madhyasthāna veta pæmiṇi atara mætivaraṇa pracaṇḍa kriyā ræsak vārtā viya. kveṭā nagarayē siduvū bōmba pipirīmakin pudgalayan 31 k pamaṇa miya giyaha.
kuvēṭhi samāja mādhya jāla ossē prasiddhiyaṭa patva æti kāntāvak eraṭa sēvaya karana pilipīna gṛha sēvikāvan piḷiban̆da paḷa kaḷa matayakaṭa erehiva dæḍi vivēcana ella vemin tibē.
52 changes: 52 additions & 0 deletions README.md
@@ -0,0 +1,52 @@
# Sinhala Text Classification

Text classification is a popular machine learning application. This repository contains the code, datasets, and results for a Sinhala news classifier.


## Folder Structure

```
Master --|
         |- Code
         |- Testing
         |- Training
         |- Validation
```

The Code folder contains Code.py, which implements the classifier together with the data pre-processing. It also contains the trained n-gram-based model in HDF5 format.

The Testing, Training, and Validation folders contain the test, train, and validation data respectively, organized by category. For example, this Sinhala text classification is a binary classification (Local news vs. International news), so each of these folders contains two subfolders representing those categories, and each category holds a text.txt file (UTF-8 encoded) containing all of its news.

E.g.:

```
Testing --|
          |- International - text.txt
          |- Local - text.txt
```
## Details

The model was trained using n-gram features with the TF-IDF vectorization method. It reached 76% training accuracy and 70% testing accuracy. Since it is not possible to train directly with Sinhala script, a **phonetic representation** is used for Sinhala words.
Under pre-processing, the algorithm only removes URLs from the news text.
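
A minimal sketch of the bigram TF-IDF feature extraction implemented in Code.py (the two sample lines are taken from the Predict/text.txt file in this commit):

```python
from sklearn.feature_extraction.text import TfidfVectorizer

# Romanized (phonetic) Sinhala lines, as used throughout this repository.
docs = ['pākistānayē mahā mætivaraṇaya pævætvū badādā',
        'kuvēṭhi samāja mādhya jāla ossē prasiddhiyaṭa patva']
vectorizer = TfidfVectorizer(ngram_range=(1, 2), analyzer='word')
features = vectorizer.fit_transform(docs)  # sparse TF-IDF matrix
print(features.shape)  # (2, number of distinct 1-grams and 2-grams)
```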

## Requirements

- Python 3.x
- TensorFlow 1.8
- Keras 2.1.x
- scikit-learn (sklearn)
- NumPy

## Run

To run the code, execute the following from inside the Code folder using a terminal (macOS/Linux) or CMD (Windows):

```
python3 Code.py
```
## Re-train

To re-train, supply another set of news articles under your preferred categories and run the training again. Some parameters in `Code.py` also have to be adjusted, as sketched below.
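
For instance, adding a third category (the `'Sports'` label here is hypothetical) means extending the global category list and adapting the label encoding and the output layer, which currently assume a binary Local/International split:

```python
# In Code.py; 'Sports' is a hypothetical third category.
categories = ['Local', 'International', 'Sports']

# Note: createCorpus() currently emits binary labels (1 = Local, 0 = other),
# and the final layer is Dense(1, activation='sigmoid'). With more than two
# categories, one-hot labels and a Dense(len(categories),
# activation='softmax') output layer would be needed.
```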

## Prediction

Add your prediction text.txt file to the `Predict` folder. Remember to name it text.txt (unless you edit Code.py) and keep it UTF-8 encoded.
For a test run of the prediction, just run the code as described in the *Run* subsection.
The saved HDF5 file must be present under the Code folder; if it is missing, the program automatically trains before predicting.
Note that your predictions may not be very accurate, since this model only reached 70% accuracy and was trained on a limited vocabulary. For better results, retrain the model with news paragraphs that cover your preferred vocabulary.
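
To classify a file with a different name, `predict` can also be called directly. A sketch, assuming `Code.py` is importable from the Code folder; `mynews` is a hypothetical name for a `Predict/mynews.txt` file:

```python
from Code import predict

predict('mynews')  # reads Predict/mynews.txt, prints the averaged score and the class
```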