Skip to content

Commit

Permalink
Adding Stop Words - A Text Mining Problem
Browse files Browse the repository at this point in the history
  • Loading branch information
Jarvis@LNMIIT committed Sep 17, 2014
1 parent eb542e4 commit ad62de0
Show file tree
Hide file tree
Showing 2 changed files with 131 additions and 0 deletions.
88 changes: 88 additions & 0 deletions Stop_Words/aayushKumarJarvis/StopWords.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import java.util.*;
import java.io.*;

/**
 * Small command-line utility that removes stop words from a text file.
 *
 * <p>The stop words are read from one file, the text to filter from another, and the
 * filtered text is written to {@link #OUTPUT_FILE}. Matching is case-sensitive.
 */
public class StopWords {

    /** Path of the file that {@link #removeStopWords(String, List)} writes the filtered text to. */
    private static final String OUTPUT_FILE = "YOUR_FILE_LOCATION";

    /** Any run of characters that are not ASCII letters or digits separates two words. */
    private static final String WORD_DELIMITER_REGEX = "[^a-zA-Z0-9]+";

    /**
     * Tells whether {@code word} occurs in {@code textForCheck}.
     *
     * <p>The list does not have to be sorted: a plain membership test is used, because a
     * binary search silently returns wrong answers on unsorted input.
     *
     * @param word         the word to look for; {@code null} is never found
     * @param textForCheck the words to search in; {@code null} is treated as empty
     * @return {@code true} if the list contains an element equal to {@code word}
     */
    public static Boolean searchForStopWord(String word, List<String> textForCheck) {
        if (word == null || textForCheck == null) {
            return false;
        }
        return textForCheck.contains(word);
    }

    /**
     * Reads the stop words stored in a file.
     *
     * <p>Words may be separated by commas and/or line breaks. Surrounding whitespace is
     * stripped from every word and empty entries are skipped.
     *
     * @param stopWordsFilename path of the stop-word file
     * @return the stop words in file order
     * @throws Exception if the file cannot be opened or read
     */
    public static List<String> readStopWords(String stopWordsFilename) throws Exception {
        List<String> stopWords = new ArrayList<String>();

        // try-with-resources closes the reader (and the underlying file) even when reading fails.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(stopWordsFilename)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                for (String candidate : line.split(",")) {
                    // String is immutable: the trimmed value must be captured, not discarded.
                    String stopWord = candidate.trim();
                    if (!stopWord.isEmpty()) {
                        stopWords.add(stopWord);
                    }
                }
            }
        }
        return stopWords;
    }

    /**
     * Copies a text file to {@link #OUTPUT_FILE}, leaving out every stop word.
     *
     * <p>Each input line is split into words on non-alphanumeric characters; every word that
     * is not a stop word is written followed by a single space, and each input line yields
     * one output line. I/O problems are reported on {@code System.err} rather than thrown.
     *
     * @param textFilename path of the text file to filter
     * @param stopWords    the words to drop; {@code null} is treated as "no stop words"
     */
    public static void removeStopWords(String textFilename, List<String> stopWords) {
        if (textFilename == null) {
            System.err.println("No text file name given");
            return;
        }

        // Hash lookup per token instead of scanning the whole stop-word list every time.
        Set<String> stopWordSet = (stopWords == null)
                ? new HashSet<String>()
                : new HashSet<String>(stopWords);

        // The reader is opened first so that a missing input file does not truncate the output file.
        try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(textFilename)));
             PrintStream writer = new PrintStream(new FileOutputStream(OUTPUT_FILE))) {

            String line;
            while ((line = reader.readLine()) != null) {
                for (String token : line.split(WORD_DELIMITER_REGEX)) {
                    // split() yields an empty first token when the line starts with a delimiter.
                    if (token.isEmpty() || stopWordSet.contains(token)) {
                        continue;
                    }
                    writer.print(token + " ");
                }
                writer.print("\n");
            }

            // PrintStream never throws on write failures; it only records them.
            if (writer.checkError()) {
                System.err.println("Error while writing to " + OUTPUT_FILE);
            }
        } catch (IOException e) {
            System.err.println("Could not remove stop words from " + textFilename + ": " + e.getMessage());
        }
    }

    /**
     * Asks on standard input for the stop-word file and the text file, then filters the text.
     *
     * @param arg ignored
     * @throws Exception if the stop-word file cannot be read
     */
    public static void main(String[] arg) throws Exception {
        try (Scanner keyboard = new Scanner(System.in)) {
            System.out.print("Please type the stop words file name: ");
            List<String> stopWords = readStopWords(keyboard.next());

            System.out.print("Please type the text file name: ");
            removeStopWords(keyboard.next(), stopWords);
        }
    }
}

43 changes: 43 additions & 0 deletions Stop_Words/aayushKumarJarvis/TestStopWords.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.List;

import org.junit.Test;

import static org.junit.Assert.*;

/**
 * Unit tests for {@link StopWords}.
 *
 * <p>The file-based tests create their own temporary input files instead of relying on a
 * placeholder path, so they can run on any machine.
 */
public class TestStopWords {

    /**
     * Writes the given lines to a new temporary file that is deleted when the JVM exits.
     *
     * @param lines the lines to write, one per line of the file
     * @return the path of the temporary file
     * @throws IOException if the file cannot be created or written
     */
    private static String writeTempFile(String... lines) throws IOException {
        File file = File.createTempFile("stopwords-test", ".txt");
        file.deleteOnExit();
        try (PrintWriter writer = new PrintWriter(file)) {
            for (String line : lines) {
                writer.println(line);
            }
        }
        return file.getPath();
    }

    /** Words that are in the list are found; words that are not in it are rejected. */
    @Test
    public void testSearchForStopWord() {

        // The list is in natural sort order, so the expected results hold whether the
        // lookup is implemented as a linear scan or as a binary search.
        List<String> names = Arrays.asList("Aayush", "Kumar", "Srivastava");

        assertTrue(StopWords.searchForStopWord("Aayush", names));
        assertTrue(StopWords.searchForStopWord("Kumar", names));
        assertTrue(StopWords.searchForStopWord("Srivastava", names));
        assertFalse(StopWords.searchForStopWord("Random Text", names));
        assertFalse(StopWords.searchForStopWord("Text", names));
    }

    /** Every word written to the stop-word file is present in the list that is read back. */
    @Test
    public void testReadStopWords() throws Exception {

        String stopWordsFile = writeTempFile("the", "a", "of");

        List<String> tokenizedText = StopWords.readStopWords(stopWordsFile);

        assertTrue(tokenizedText.contains("the"));
        assertTrue(tokenizedText.contains("a"));
        assertTrue(tokenizedText.contains("of"));
        assertFalse(tokenizedText.contains("zebra"));
    }

    /**
     * Runs the stop-word removal end to end on a small text file.
     *
     * <p>The filtered text goes to the output path configured inside {@code StopWords}, which
     * is not visible from here, so this test only checks that the call completes without
     * throwing.
     */
    @Test
    public void testForRemoveStopWords() throws Exception {

        String stopWordsFile = writeTempFile("a", "of", "the");
        String textFile = writeTempFile("the quick brown fox", "a tale of two cities");

        List<String> tokenizedText = StopWords.readStopWords(stopWordsFile);
        StopWords.removeStopWords(textFile, tokenizedText);
    }

}

0 comments on commit ad62de0

Please sign in to comment.