1
1
import org .jsoup .nodes .Document ;
2
2
import utils .Options ;
3
3
4
+ import java .util .stream .Collector ;
4
5
import java .util .stream .Collectors ;
5
6
import java .util .stream .Stream ;
6
7
import java .util .concurrent .ConcurrentHashMap ;
10
11
11
12
/**
12
13
* This class counts the number of images in a recursively-defined
13
- * folder structure using the Java sequential stream framework. The
14
- * root folder can either reside locally (filesystem-based) or
15
- * remotely (web-based).
14
+ * folder structure using the Java sequential stream framework and
15
+ * the {@code teeing} {@link Collector}. The root folder can either
16
+ * reside locally (filesystem-based) or remotely (web-based).
16
17
*/
17
18
class ImageCounter {
18
19
/**
@@ -22,6 +23,8 @@ class ImageCounter {
22
23
23
24
/**
24
25
* A cache of unique URIs that have already been processed.
26
+ * {@link KeySetView} is a static nested class of
27
+ * {@link ConcurrentHashMap}.
25
28
*/
26
29
private final KeySetView <Object , Boolean > mUniqueUris =
27
30
ConcurrentHashMap .newKeySet ();
@@ -60,32 +63,33 @@ private long countImages(String pageUri,
60
63
+ "]: Exceeded max depth of "
61
64
+ Options .instance ().maxDepth ());
62
65
66
+ // Return 0 if we've exceeded the depth param.
63
67
return 0 ;
64
68
}
65
69
66
70
// Atomically check to see if we've already visited this URL
67
71
// and add the new url to the hashset, so we don't try to
68
72
// revisit it unnecessarily.
69
73
else if (mUniqueUris
74
+ // Get the ConcurrentHashMap that implements the
75
+ // KeySetView.
70
76
.getMap ()
77
+ // Perform the atomic-check-then-act operation.
71
78
.putIfAbsent (pageUri ,
72
79
mUniqueUris .getMappedValue ()) != null ) {
73
80
print (TAG
74
- + "[Depth"
75
- + depth
76
- + "]: Already processed "
77
- + pageUri );
81
+ + "[Depth"
82
+ + depth
83
+ + "]: Already processed "
84
+ + pageUri );
78
85
79
86
// Return 0 if we've already examined this url.
80
87
return 0 ;
81
- }
82
-
83
- // Synchronously (1) count the number of images on this page
84
- // and (2) crawl other hyperlinks accessible via this page and
85
- // count their images.
86
- else {
87
- long count = countImagesImpl (pageUri ,
88
- depth );
88
+ } else {
89
+ // Synchronously (1) count the number of images on this page
90
+ // and (2) crawl other hyperlinks accessible via this page and
91
+ // count their images.
92
+ long count = countImagesImpl (pageUri , depth );
89
93
print (TAG
90
94
+ "[Depth"
91
95
+ depth
@@ -95,6 +99,8 @@ else if (mUniqueUris
95
99
+ pageUri
96
100
+ " in thread "
97
101
+ Thread .currentThread ().getId ());
102
+
103
+ // Return the count from this level in the traversal.
98
104
return count ;
99
105
}
100
106
}
@@ -164,7 +170,7 @@ private long getCountOfImagesInPage(Document page) {
164
170
/**
165
171
* Recursively crawl through hyperlinks that are in {@code page}.
166
172
*
167
- * @param page The page containing HTML
173
+ * @param page The page containing the HTML document
168
174
* @param depth The depth of the level of web page traversal
169
175
* @return A count of how many images were in each hyperlink on
170
176
* the page
0 commit comments