forked from kennyledet/Algorithm-Implementations
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding Stop Words - A Text Mining Problem
- Loading branch information
Jarvis@LNMIIT
committed
Sep 17, 2014
1 parent
eb542e4
commit ad62de0
Showing
2 changed files
with
131 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import java.util.*; | ||
import java.io.*; | ||
|
||
/**
 * Removes stop words from a text file (a common text-mining preprocessing step).
 *
 * <p>The stop-word list is read from a file; every token of the input text that
 * appears in that list is dropped and the remaining tokens are written to
 * {@code OUTPUT_FILE}. Matching is case-sensitive.
 */
public class StopWords {

    /** Path of the file that {@link #removeStopWords(String, List)} writes its result to. */
    private static final String OUTPUT_FILE = "YOUR_FILE_LOCATION";

    /**
     * Matches any single character that is not an ASCII letter or digit; used to
     * split a line of text into tokens. Compiled once instead of on every line.
     * (The range is {@code A-Z}; {@code A-z} would also admit {@code [ \ ] ^ _ `}.)
     */
    private static final java.util.regex.Pattern TOKEN_DELIMITER =
            java.util.regex.Pattern.compile("[^a-zA-Z0-9]");

    /**
     * Tells whether {@code word} occurs in {@code textForCheck}.
     *
     * <p>The lookup is a binary search, so {@code textForCheck} must be sorted in
     * natural (ascending) order, as the list returned by
     * {@link #readStopWords(String)} is. Results on an unsorted list are undefined.
     *
     * @param word the word to look for; {@code null} is never found
     * @param textForCheck the sorted list of stop words; {@code null} is treated as empty
     * @return {@code true} if the word is present, {@code false} otherwise
     */
    public static Boolean searchForStopWord(String word, List<String> textForCheck) {
        if (word == null || textForCheck == null) {
            return false;
        }
        return Collections.binarySearch(textForCheck, word) >= 0;
    }

    /**
     * Reads the stop words stored in a file.
     *
     * <p>Words may be separated by line breaks and/or commas. Surrounding
     * whitespace is stripped and empty entries are skipped.
     *
     * @param stopWordsFilename path of the stop-word file
     * @return the stop words, sorted in natural order so that the result can be
     *     passed directly to {@link #searchForStopWord(String, List)}
     * @throws IOException if the file cannot be opened or read
     */
    public static List<String> readStopWords(String stopWordsFilename) throws IOException {
        List<String> stopWords = new ArrayList<String>();

        // try-with-resources closes the reader (and the underlying stream) even on failure.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(stopWordsFilename)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                for (String candidate : line.split(",")) {
                    String stopWord = candidate.trim();
                    if (!stopWord.isEmpty()) {
                        stopWords.add(stopWord);
                    }
                }
            }
        }

        // Binary search in searchForStopWord requires sorted input.
        Collections.sort(stopWords);
        return stopWords;
    }

    /**
     * Copies a text file to {@code OUTPUT_FILE}, leaving out every stop word.
     *
     * <p>Each input line is split on non-alphanumeric characters. Every surviving
     * token is written followed by a single space, and each input line produces
     * one output line terminated by {@code "\n"}. I/O failures are reported on
     * {@code System.err} rather than thrown.
     *
     * @param textFilename path of the text file to filter
     * @param stopWords the words to remove (need not be sorted); {@code null} removes nothing
     */
    public static void removeStopWords(String textFilename, List<String> stopWords) {
        // A hash set gives constant-time lookups and does not depend on the list being sorted.
        Set<String> stopWordSet = (stopWords == null)
                ? new HashSet<String>()
                : new HashSet<String>(stopWords);

        // The input is opened first so a missing input file does not truncate the output file.
        try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(textFilename)));
             PrintStream writer = new PrintStream(
                     new BufferedOutputStream(new FileOutputStream(OUTPUT_FILE)))) {

            String line;
            while ((line = reader.readLine()) != null) {
                for (String token : TOKEN_DELIMITER.split(line)) {
                    // Consecutive delimiters yield empty tokens; they carry no text.
                    if (!token.isEmpty() && !stopWordSet.contains(token)) {
                        writer.print(token + " ");
                    }
                }
                writer.print("\n");
            }

            // PrintStream never throws on write errors; it only records them.
            if (writer.checkError()) {
                System.err.println("Failed to write to " + OUTPUT_FILE);
            }
        } catch (IOException e) {
            System.err.println(e.getMessage());
        }
    }

    /**
     * Prompts for the stop-word file and the text file on standard input, then
     * writes the filtered text to {@code OUTPUT_FILE}.
     *
     * @param arg ignored
     * @throws Exception if the stop-word file cannot be read
     */
    public static void main(String[] arg) throws Exception {
        Scanner keyboard = new Scanner(System.in);

        System.out.print("Please type the stop words file name: ");
        List<String> stopWords = readStopWords(keyboard.next());

        System.out.print("Please type the text file name: ");
        removeStopWords(keyboard.next(), stopWords);
    }
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import org.junit.Test; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
|
||
import static org.junit.Assert.*; | ||
|
||
public class TestStopWords { | ||
|
||
@Test | ||
public void testSearchForStopWord() { | ||
|
||
String testWord1 = "Aayush"; | ||
String testWord2 = "Kumar"; | ||
String testWord3 = "Srivastava"; | ||
String testWord4 = "Random Text"; | ||
String testWord5 = "Text"; | ||
|
||
String[] listOfNames = {"Aayush","Kumar","Srivastava"}; | ||
List<String> testString = Arrays.asList(listOfNames); // Converting Array into List of String | ||
|
||
assertEquals(StopWords.searchForStopWord(testWord1, testString), true); | ||
assertEquals(StopWords.searchForStopWord(testWord2,testString),true); | ||
assertEquals(StopWords.searchForStopWord(testWord3,testString),true); | ||
assertEquals(StopWords.searchForStopWord(testWord4,testString),false); | ||
assertEquals(StopWords.searchForStopWord(testWord5,testString),false); | ||
} | ||
|
||
@Test | ||
public void testReadStopWords() throws Exception { | ||
|
||
List<String> tokenizedText = StopWords.readStopWords("FILE_NAME"); | ||
System.out.println(tokenizedText); | ||
} | ||
|
||
@Test | ||
public void testForRemoveStopWords() throws Exception { | ||
|
||
List<String> tokenizedText = StopWords.readStopWords("FILE_NAME"); | ||
StopWords.removeStopWords("FILE_NAME",tokenizedText); | ||
} | ||
|
||
} | ||
|