forked from mahmoudsebak/Search-Engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
PageRank.java
116 lines (97 loc) · 3.9 KB
/
PageRank.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import java.util.HashMap;
import java.util.HashSet;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
public class PageRank
{
public static final int Iterations = 10;
public static void main(String[] args)
{
IndexerDbAdapter adapter = new IndexerDbAdapter();
adapter.open();
long startTime = System.nanoTime();
ArrayList<Pair<String,String>> connections = adapter.fetchAllLinks();
HashMap<String,Double> ret = CalculatePageRank(connections,adapter.getDocumentsNum());
adapter.resetPagesRank();
int i = 0;
for(HashMap.Entry<String,Double> entry : ret.entrySet())
{
adapter.updateURL(entry.getKey(), entry.getValue());
System.out.println("Page " + i++);
}
long endTime = System.nanoTime();
adapter.close();
System.out.println("Taken time : "+ ((endTime-startTime)*1.0/1e9));
}
/**
*
* @param connections: list of pairs <src url,dst url>
* @return map of each url and its pageRank
*/
public static HashMap<String,Double> CalculatePageRank(ArrayList<Pair<String,String>> connections,int total_links)
{
HashMap<String,Double> PagesRank = new HashMap<String, Double>();
HashMap<String,Integer> outDegree = new HashMap<String, Integer>();
HashMap<String,HashSet<String>> Pages = new HashMap<String, HashSet<String>>();
for(int i = 0 ; i < connections.size() ; i++)
{
String src = connections.get(i).first, dst = connections.get(i).second;
PagesRank.put(src,1.0);
PagesRank.put(dst,1.0);
URL url = null;
try {
url = new URL(src);
} catch (MalformedURLException e) {
e.printStackTrace();
}
String BaseURL = "";
if(url != null)
BaseURL = url.getProtocol() + "://" + url.getHost();
if(BaseURL.equals("") || dst.equals(BaseURL) || dst.equals(src)) // ignore back_ward connections to the parent link & self connections
continue;
if(!Pages.containsKey(dst))
Pages.put(dst,new HashSet<String>());
HashSet<String>temp = Pages.get(dst);
temp.add(src);
Pages.put(dst,temp);
if(outDegree.containsKey(src))
outDegree.put(src,outDegree.get(src)+1);
else
outDegree.put(src,1);
}
for(HashMap.Entry<String,Double> entry : PagesRank.entrySet())
entry.setValue(1.0/total_links);
for(int i = 0 ; i < Iterations ; i++)
{
HashMap<String,Double> temp = new HashMap<>();
for(HashMap.Entry<String,Double> entry : PagesRank.entrySet())
temp.put(entry.getKey(), entry.getValue());
for(HashMap.Entry<String,Double> entry : PagesRank.entrySet())
{
String Page = entry.getKey();
if(Pages.containsKey(Page))
{
double rank = 0;
HashSet<String> in_URL = Pages.get(Page);
Iterator<String> it = in_URL.iterator();
while(it.hasNext())
{
String in = it.next();
rank += (temp.get(in)/outDegree.get(in));
}
entry.setValue(rank);
}
}
}
//scaling PageRank
Double maxRank = 0.0;
for(HashMap.Entry<String,Double> entry : PagesRank.entrySet())
maxRank = Math.max(maxRank,entry.getValue());
if(maxRank > 0)
for(HashMap.Entry<String,Double> entry : PagesRank.entrySet())
entry.setValue(entry.getValue()/maxRank);
return PagesRank;
}
}