-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added some city comparison algorithms
- Loading branch information
1 parent
b3fa377
commit 38f516f
Showing
9 changed files
with
204 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,28 +1,36 @@ | ||
package in.arod.addressNormalizer; | ||
|
||
import in.arod.addressNormalizer.model.street.OriginalStreetType; | ||
import in.arod.addressNormalizer.repository.OriginalAddressRepository; | ||
import in.arod.addressNormalizer.repository.street.OriginalStreetTypeRepository; | ||
import in.arod.addressNormalizer.service.impl.PairsAlgorithm; | ||
import lombok.RequiredArgsConstructor; | ||
import org.springframework.context.event.ContextStartedEvent; | ||
import org.springframework.context.event.EventListener; | ||
import org.springframework.stereotype.Service; | ||
|
||
import java.util.Set; | ||
import java.util.List; | ||
|
||
// NOTE(review): this listing looks like a rendered diff in which removed and added
// lines are interleaved without +/- markers. As shown, `streetTypes` is declared twice
// in the same scope (once as Set<String>, once as List<String>), which would not
// compile. Confirm which statements belong to the current revision before relying on
// the comments below.
/**
 * Spring service whose {@link #test} listener is invoked with a ContextStartedEvent.
 * The three final fields are injected through the constructor generated by Lombok's
 * {@code @RequiredArgsConstructor}.
 */
@Service | ||
@RequiredArgsConstructor | ||
public class MyService { | ||
private final OriginalAddressRepository originalAddressRepository; | ||
private final OriginalStreetTypeRepository originalStreetTypeRepository; | ||
private final PairsAlgorithm pairsAlgoritm; | ||
|
||
|
||
/**
 * Event listener taking a ContextStartedEvent; the event object itself is not read.
 *
 * As listed, the body does two things: it saves one OriginalStreetType per distinct
 * street-type string, and it compares every pair of distinct city strings, printing
 * the pairs whose score is above 0.80 to standard output.
 */
@EventListener | ||
public void test(ContextStartedEvent contextStartedEvent) { | ||
// Fetch the distinct street-type strings and save one OriginalStreetType per value,
// with the string stored as its title.
Set<String> streetTypes = originalAddressRepository.findUniqueStreetTypes(); | ||
streetTypes.forEach(s -> { | ||
OriginalStreetType originalStreetType = new OriginalStreetType(); | ||
originalStreetType.setTitle(s); | ||
originalStreetTypeRepository.save(originalStreetType); | ||
}); | ||
// NOTE(review): second declaration of `streetTypes` (see note at the top of the class).
// Despite its name, this list holds the result of findUniqueCities().
List<String> streetTypes = originalAddressRepository.findUniqueCities(); | ||
// Visit each unordered pair exactly once: j always starts one past i.
for (int i = 0; i < streetTypes.size(); i++) { | ||
for (int j = i + 1; j < streetTypes.size(); j++) { | ||
try { | ||
double v = pairsAlgoritm.compare(streetTypes.get(i), streetTypes.get(j)); | ||
if (v > 0.80) | ||
System.out.printf("%30s %30s %f \n", streetTypes.get(i), streetTypes.get(j), v); | ||
} catch (Exception e) { | ||
// NOTE(review): any exception thrown while comparing a pair is silently discarded,
// so failing pairs are skipped without a trace. The commented-out line below would
// print the offending pair to standard error.
// System.err.println(streetTypes.get(i) + " " + streetTypes.get(j)); | ||
} | ||
} | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
5 changes: 5 additions & 0 deletions
5
src/main/java/in/arod/addressNormalizer/service/Algorithm.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
package in.arod.addressNormalizer.service; | ||
|
||
/**
 * Contract for a pairwise string comparison that yields a numeric score.
 *
 * <p>The meaning of the score is defined by each implementation: some return a
 * similarity (higher means more alike), others return an edit distance (higher means
 * less alike). Callers must know which implementation they hold before applying a
 * threshold.
 */
public interface Algorithm {

    /**
     * Compares two strings.
     *
     * @param s1 the first string
     * @param s2 the second string
     * @return the implementation-specific score for the pair
     */
    float compare(String s1, String s2);
}
13 changes: 13 additions & 0 deletions
13
src/main/java/in/arod/addressNormalizer/service/impl/JavaLangAlgorithm.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
package in.arod.addressNormalizer.service.impl; | ||
|
||
import in.arod.addressNormalizer.service.Algorithm; | ||
import org.apache.commons.lang3.StringUtils; | ||
import org.springframework.stereotype.Service; | ||
|
||
@Service | ||
public class JavaLangAlgorithm implements Algorithm { | ||
@Override | ||
public float compare(String s1, String s2) { | ||
return (float) StringUtils.getJaroWinklerDistance(s1, s2); | ||
} | ||
} |
40 changes: 40 additions & 0 deletions
40
src/main/java/in/arod/addressNormalizer/service/impl/LevenshatinAlgorithm.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
package in.arod.addressNormalizer.service.impl; | ||
|
||
import in.arod.addressNormalizer.service.Algorithm; | ||
import org.springframework.stereotype.Service; | ||
|
||
@Service | ||
public class LevenshatinAlgorithm implements Algorithm { | ||
@Override | ||
public float compare(String s1, String s2) { | ||
return levenstain(s1, s2); | ||
} | ||
public int levenstain(String str1, String str2) { | ||
int[] Di_1 = new int[str2.length() + 1]; | ||
int[] Di = new int[str2.length() + 1]; | ||
|
||
for (int j = 0; j <= str2.length(); j++) { | ||
Di[j] = j; // (i == 0) | ||
} | ||
|
||
for (int i = 1; i <= str1.length(); i++) { | ||
System.arraycopy(Di, 0, Di_1, 0, Di_1.length); | ||
|
||
Di[0] = i; // (j == 0) | ||
for (int j = 1; j <= str2.length(); j++) { | ||
int cost = (str1.charAt(i - 1) != str2.charAt(j - 1)) ? 1 : 0; | ||
Di[j] = min( | ||
Di_1[j] + 1, | ||
Di[j - 1] + 1, | ||
Di_1[j - 1] + cost | ||
); | ||
} | ||
} | ||
|
||
return Di[Di.length - 1]; | ||
} | ||
|
||
private int min(int n1, int n2, int n3) { | ||
return Math.min(Math.min(n1, n2), n3); | ||
} | ||
} |
49 changes: 49 additions & 0 deletions
49
src/main/java/in/arod/addressNormalizer/service/impl/LevenshatinCaseIgnoreAlgorithm.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
package in.arod.addressNormalizer.service.impl; | ||
|
||
import in.arod.addressNormalizer.service.Algorithm; | ||
import org.springframework.stereotype.Service; | ||
|
||
@Service | ||
public class LevenshatinCaseIgnoreAlgorithm implements Algorithm { | ||
@Override | ||
public float compare(String stringOne, String stringTwo) { | ||
// if we want to ignore case sensitivity, lower case the strings | ||
|
||
stringOne = stringOne.toLowerCase(); | ||
stringTwo = stringTwo.toLowerCase(); | ||
|
||
|
||
// store length | ||
int m = stringOne.length(); | ||
int n = stringTwo.length(); | ||
|
||
// matrix to store differences | ||
int[][] deltaM = new int[m + 1][n + 1]; | ||
|
||
for (int i = 1; i <= m; i++) { | ||
deltaM[i][0] = i; | ||
} | ||
|
||
for (int j = 1; j <= n; j++) { | ||
deltaM[0][j] = j; | ||
} | ||
|
||
for (int j = 1; j <= n; j++) { | ||
for (int i = 1; i <= m; i++) { | ||
if (stringOne.charAt(i - 1) == stringTwo.charAt(j - 1)) { | ||
deltaM[i][j] = deltaM[i - 1][j - 1]; | ||
} else { | ||
deltaM[i][j] = Math.min( | ||
deltaM[i - 1][j] + 1, | ||
Math.min( | ||
deltaM[i][j - 1] + 1, | ||
deltaM[i - 1][j - 1] + 1 | ||
) | ||
); | ||
} | ||
} | ||
} | ||
|
||
return deltaM[m][n]; | ||
} | ||
} |
61 changes: 61 additions & 0 deletions
61
src/main/java/in/arod/addressNormalizer/service/impl/PairsAlgorithm.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
package in.arod.addressNormalizer.service.impl; | ||
|
||
import in.arod.addressNormalizer.service.Algorithm; | ||
import org.springframework.stereotype.Service; | ||
|
||
import java.util.ArrayList; | ||
|
||
@Service | ||
public class PairsAlgorithm implements Algorithm { | ||
@Override | ||
public float compare(String str1, String str2) { | ||
ArrayList pairs1 = wordLetterPairs(str1.toUpperCase()); | ||
ArrayList pairs2 = wordLetterPairs(str2.toUpperCase()); | ||
int intersection = 0; | ||
int union = pairs1.size() + pairs2.size(); | ||
for (int i = 0; i < pairs1.size(); i++) { | ||
Object pair1 = pairs1.get(i); | ||
for (int j = 0; j < pairs2.size(); j++) { | ||
Object pair2 = pairs2.get(j); | ||
if (pair1.equals(pair2)) { | ||
intersection++; | ||
pairs2.remove(j); | ||
break; | ||
} | ||
} | ||
} | ||
return (float) ((2.0 * intersection) / union); | ||
} | ||
|
||
/** | ||
* @return an array of adjacent letter pairs contained in the input string | ||
*/ | ||
private String[] letterPairs(String str) { | ||
int numPairs = str.length() - 1; | ||
String[] pairs = new String[numPairs]; | ||
for (int i = 0; i < numPairs; i++) { | ||
pairs[i] = str.substring(i, i + 2); | ||
|
||
} | ||
return pairs; | ||
} | ||
|
||
|
||
/** | ||
* @return an ArrayList of 2-character Strings. | ||
*/ | ||
private ArrayList wordLetterPairs(String str) { | ||
ArrayList allPairs = new ArrayList(); | ||
// Tokenize the string and put the tokens/words into an array | ||
String[] words = str.split("\\s"); | ||
// For each word | ||
for (int w = 0; w < words.length; w++) { | ||
// Find the pairs of characters | ||
String[] pairsInWord = letterPairs(words[w]); | ||
for (int p = 0; p < pairsInWord.length; p++) { | ||
allPairs.add(pairsInWord[p]); | ||
} | ||
} | ||
return allPairs; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters