Skip to content

Commit

Permalink
Added some city comparison algorithms
Browse files Browse the repository at this point in the history
  • Loading branch information
rodin-andrei committed Sep 14, 2021
1 parent b3fa377 commit 38f516f
Show file tree
Hide file tree
Showing 9 changed files with 204 additions and 13 deletions.
11 changes: 10 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@
</properties>

<dependencies>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
Expand Down Expand Up @@ -50,6 +54,11 @@
<artifactId>mysql-connector-java</artifactId>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.12</version>
</dependency>

</dependencies>
</project>
</project>
24 changes: 16 additions & 8 deletions src/main/java/in/arod/addressNormalizer/MyService.java
Original file line number Diff line number Diff line change
@@ -1,28 +1,36 @@
package in.arod.addressNormalizer;

import in.arod.addressNormalizer.model.street.OriginalStreetType;
import in.arod.addressNormalizer.repository.OriginalAddressRepository;
import in.arod.addressNormalizer.repository.street.OriginalStreetTypeRepository;
import in.arod.addressNormalizer.service.impl.PairsAlgorithm;
import lombok.RequiredArgsConstructor;
import org.springframework.context.event.ContextStartedEvent;
import org.springframework.context.event.EventListener;
import org.springframework.stereotype.Service;

import java.util.Set;
import java.util.List;

/**
 * Spring service whose event listener loads a list of strings from
 * {@code OriginalAddressRepository.findUniqueCities()}, scores every unordered pair of them with
 * {@link PairsAlgorithm} and prints the pairs whose score is above 0.80.
 *
 * <p>NOTE(review): this listing appears to interleave the removed and the added lines of a diff:
 * {@code streetTypes} is declared twice in {@link #test} (once as a {@code Set}, once as a
 * {@code List}). Confirm which of the two bodies is the live one before editing.
 */
@Service
@RequiredArgsConstructor
public class MyService {
// Collaborators are final; the constructor is generated by Lombok's @RequiredArgsConstructor.
private final OriginalAddressRepository originalAddressRepository;
private final OriginalStreetTypeRepository originalStreetTypeRepository;
private final PairsAlgorithm pairsAlgoritm;


/**
 * Listener invoked with a {@link ContextStartedEvent}.
 *
 * <p>NOTE(review): presumably this event is only published on an explicit context
 * {@code start()} call - confirm that the listener actually fires at application startup.
 *
 * @param contextStartedEvent the received event; not read by the method body
 */
@EventListener
public void test(ContextStartedEvent contextStartedEvent) {
// First body: saves one OriginalStreetType row per distinct street type title.
Set<String> streetTypes = originalAddressRepository.findUniqueStreetTypes();
streetTypes.forEach(s -> {
OriginalStreetType originalStreetType = new OriginalStreetType();
originalStreetType.setTitle(s);
originalStreetTypeRepository.save(originalStreetType);
});
// Second body: despite the variable name, the list now holds the result of findUniqueCities().
List<String> streetTypes = originalAddressRepository.findUniqueCities();
// j starts at i + 1 so each unordered pair is scored exactly once and nothing is compared with itself.
for (int i = 0; i < streetTypes.size(); i++) {
for (int j = i + 1; j < streetTypes.size(); j++) {
try {
double v = pairsAlgoritm.compare(streetTypes.get(i), streetTypes.get(j));
// Only pairs scoring above 0.80 are reported.
if (v > 0.80)
System.out.printf("%30s %30s %f \n", streetTypes.get(i), streetTypes.get(j), v);
} catch (Exception e) {
// NOTE(review): every exception thrown while comparing a pair is silently discarded here,
// so failing pairs are skipped without any trace - confirm this is intentional.
// System.err.println(streetTypes.get(i) + " " + streetTypes.get(j));
}
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.jpa.repository.Query;

import java.util.Set;
import java.util.List;

/**
 * Spring Data JPA repository for {@code OriginalAddress} entities keyed by {@code Long}.
 *
 * <p>NOTE(review): this listing appears to interleave the removed and the added lines of a diff:
 * the interface header is repeated and {@code findUniqueStreetTypes} is declared with both a
 * {@code Set} and a {@code List} return type. Confirm which declarations are the live ones.
 */
public interface OriginalAddressRepository extends JpaRepository<OriginalAddress, Long> {
public interface OriginalAddressRepository extends JpaRepository<OriginalAddress, Long> {
// Selects TypeStreet grouped by TypeStreet, i.e. one row per distinct value.
@Query("select TypeStreet from OriginalAddress GROUP BY TypeStreet")
Set<String> findUniqueStreetTypes();
List<String> findUniqueStreetTypes();

// Selects City grouped by City, i.e. one row per distinct value.
@Query("select City from OriginalAddress GROUP BY City")
List<String> findUniqueCities();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package in.arod.addressNormalizer.service;

/**
 * Strategy for comparing two strings and producing a single numeric score.
 *
 * <p>NOTE(review): the interface does not fix what the score means (similarity ratio versus
 * edit distance, or its range); check the chosen implementation before applying a threshold.
 */
public interface Algorithm {
/**
 * Compares two strings.
 *
 * @param s1 the first string
 * @param s2 the second string
 * @return the score computed by the implementation for the two strings
 */
float compare(String s1, String s2);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package in.arod.addressNormalizer.service.impl;

import in.arod.addressNormalizer.service.Algorithm;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;

/**
 * {@link Algorithm} backed by Apache Commons Lang: the score is whatever
 * {@code StringUtils.getJaroWinklerDistance} reports for the two inputs.
 */
@Service
public class JavaLangAlgorithm implements Algorithm {

    /**
     * Delegates to {@code StringUtils.getJaroWinklerDistance} and narrows the result to
     * {@code float}.
     *
     * @param s1 the first string
     * @param s2 the second string
     * @return the library's Jaro-Winkler score for the two strings, as a {@code float}
     */
    @Override
    public float compare(String s1, String s2) {
        final double jaroWinkler = StringUtils.getJaroWinklerDistance(s1, s2);
        return (float) jaroWinkler;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package in.arod.addressNormalizer.service.impl;

import in.arod.addressNormalizer.service.Algorithm;
import org.springframework.stereotype.Service;

/**
 * {@link Algorithm} that scores two strings by their case-sensitive Levenshtein edit distance.
 */
@Service
public class LevenshatinAlgorithm implements Algorithm {

    /**
     * Returns the edit distance between the two strings, widened to {@code float}.
     *
     * @param s1 the first string
     * @param s2 the second string
     * @return the value of {@link #levenstain(String, String)} for the two strings
     */
    @Override
    public float compare(String s1, String s2) {
        return levenstain(s1, s2);
    }

    /**
     * Computes the Levenshtein distance with two rolling rows instead of a full matrix.
     *
     * @param str1 the first string
     * @param str2 the second string
     * @return the minimum number of single-character insertions, deletions and substitutions
     *     needed to turn {@code str1} into {@code str2}
     */
    public int levenstain(String str1, String str2) {
        final int sourceLength = str1.length();
        final int targetLength = str2.length();

        int[] previousRow = new int[targetLength + 1];
        int[] currentRow = new int[targetLength + 1];

        // Row 0: turning the empty prefix of str1 into a prefix of str2 costs one insert per char.
        for (int col = 0; col <= targetLength; col++) {
            currentRow[col] = col;
        }

        for (int row = 1; row <= sourceLength; row++) {
            // The row just finished becomes "previous"; the stale buffer is fully overwritten below.
            final int[] recycled = previousRow;
            previousRow = currentRow;
            currentRow = recycled;

            // Column 0: turning a prefix of str1 into the empty string costs one delete per char.
            currentRow[0] = row;
            final char sourceChar = str1.charAt(row - 1);
            for (int col = 1; col <= targetLength; col++) {
                final int substitutionCost = (sourceChar == str2.charAt(col - 1)) ? 0 : 1;
                final int viaDeletion = previousRow[col] + 1;
                final int viaInsertion = currentRow[col - 1] + 1;
                final int viaSubstitution = previousRow[col - 1] + substitutionCost;
                currentRow[col] = Math.min(Math.min(viaDeletion, viaInsertion), viaSubstitution);
            }
        }

        return currentRow[targetLength];
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package in.arod.addressNormalizer.service.impl;

import in.arod.addressNormalizer.service.Algorithm;
import org.springframework.stereotype.Service;

/**
 * {@link Algorithm} that scores two strings by their Levenshtein edit distance after folding
 * both to lower case, so differences in letter case do not count as edits.
 */
@Service
public class LevenshatinCaseIgnoreAlgorithm implements Algorithm {

    /**
     * Computes the case-insensitive Levenshtein distance between the two strings.
     *
     * @param stringOne the first string
     * @param stringTwo the second string
     * @return the minimum number of single-character insertions, deletions and substitutions
     *     needed to turn the lower-cased {@code stringOne} into the lower-cased
     *     {@code stringTwo}, widened to {@code float}
     */
    @Override
    public float compare(String stringOne, String stringTwo) {
        // Fold case with Locale.ROOT: the no-argument toLowerCase() uses the JVM default locale,
        // which makes results machine-dependent (e.g. under a Turkish locale "I" becomes a
        // dotless i, so "ISTANBUL" and "istanbul" would no longer compare as equal).
        stringOne = stringOne.toLowerCase(java.util.Locale.ROOT);
        stringTwo = stringTwo.toLowerCase(java.util.Locale.ROOT);

        int m = stringOne.length();
        int n = stringTwo.length();

        // deltaM[i][j] = distance between the first i chars of stringOne and the first j of stringTwo.
        int[][] deltaM = new int[m + 1][n + 1];

        // Against an empty string the distance is simply the prefix length.
        for (int i = 1; i <= m; i++) {
            deltaM[i][0] = i;
        }
        for (int j = 1; j <= n; j++) {
            deltaM[0][j] = j;
        }

        for (int j = 1; j <= n; j++) {
            for (int i = 1; i <= m; i++) {
                if (stringOne.charAt(i - 1) == stringTwo.charAt(j - 1)) {
                    // Matching characters extend the previous alignment at no cost.
                    deltaM[i][j] = deltaM[i - 1][j - 1];
                } else {
                    // Cheapest of deletion, insertion and substitution, each costing one edit.
                    deltaM[i][j] = Math.min(
                            deltaM[i - 1][j] + 1,
                            Math.min(
                                    deltaM[i][j - 1] + 1,
                                    deltaM[i - 1][j - 1] + 1
                            )
                    );
                }
            }
        }

        return deltaM[m][n];
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package in.arod.addressNormalizer.service.impl;

import in.arod.addressNormalizer.service.Algorithm;
import org.springframework.stereotype.Service;

import java.util.ArrayList;

/**
 * {@link Algorithm} that scores two strings by the overlap of their adjacent-letter pairs
 * (a Dice coefficient over per-word character bigrams), ignoring letter case.
 */
@Service
public class PairsAlgorithm implements Algorithm {

    /**
     * Computes {@code 2 * |shared pairs| / (|pairs of str1| + |pairs of str2|)}.
     *
     * <p>Each pair of the second string can be matched at most once, so repeated pairs are
     * counted with their multiplicity. When neither string contains a letter pair (for example
     * both are empty or consist of single-character words) the ratio is undefined; in that case
     * the result is {@code 1.0} if the upper-cased strings are equal and {@code 0.0} otherwise,
     * instead of NaN.
     *
     * @param str1 the first string
     * @param str2 the second string
     * @return a similarity in the range {@code [0.0, 1.0]}; higher means more similar
     */
    @Override
    public float compare(String str1, String str2) {
        // Locale.ROOT keeps case folding independent of the JVM default locale.
        String upper1 = str1.toUpperCase(java.util.Locale.ROOT);
        String upper2 = str2.toUpperCase(java.util.Locale.ROOT);

        ArrayList<String> pairs1 = wordLetterPairs(upper1);
        ArrayList<String> pairs2 = wordLetterPairs(upper2);

        int union = pairs1.size() + pairs2.size();
        if (union == 0) {
            // No pairs on either side: avoid 0/0 and fall back to plain equality.
            return upper1.equals(upper2) ? 1.0f : 0.0f;
        }

        int intersection = 0;
        for (String pair : pairs1) {
            // remove(Object) drops the first equal element, so a pair is never matched twice.
            if (pairs2.remove(pair)) {
                intersection++;
            }
        }
        return (float) ((2.0 * intersection) / union);
    }

    /**
     * @return an array of adjacent letter pairs contained in the input string; empty when the
     *     string has fewer than two characters
     */
    private String[] letterPairs(String str) {
        // Clamp at zero: an empty word would otherwise request an array of size -1.
        int numPairs = Math.max(0, str.length() - 1);
        String[] pairs = new String[numPairs];
        for (int i = 0; i < numPairs; i++) {
            pairs[i] = str.substring(i, i + 2);
        }
        return pairs;
    }

    /**
     * @return a list of the 2-character Strings found in each whitespace-separated word of
     *     the input; pairs never span a word boundary
     */
    private ArrayList<String> wordLetterPairs(String str) {
        ArrayList<String> allPairs = new ArrayList<>();
        // Splitting on single whitespace characters can produce empty words (leading or
        // consecutive whitespace); those simply contribute no pairs.
        for (String word : str.split("\\s")) {
            for (String pair : letterPairs(word)) {
                allPairs.add(pair);
            }
        }
        return allPairs;
    }
}
5 changes: 4 additions & 1 deletion src/main/resources/application.properties
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,7 @@ spring.datasource.username=root
spring.datasource.password=root_pwd
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
spring.jpa.database-platform=org.hibernate.dialect.MySQL8Dialect
spring.jpa.show-sql: true
spring.jpa.show-sql: true


javaLangAlgo=1;

0 comments on commit 38f516f

Please sign in to comment.