Skip to content

Commit acecf30

Browse files
[MINOR] Avoid listing files for empty tables (apache#11155)
1 parent 7f8da18 commit acecf30

File tree

2 files changed

+23
-12
lines changed

2 files changed

+23
-12
lines changed

hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java

+12 -5
Original file line number | Diff line number | Diff line change
@@ -83,12 +83,14 @@
8383

8484
import java.io.FileNotFoundException;
8585
import java.io.IOException;
86+
import java.util.ArrayDeque;
8687
import java.util.ArrayList;
8788
import java.util.Collections;
8889
import java.util.HashMap;
8990
import java.util.LinkedList;
9091
import java.util.List;
9192
import java.util.Map;
93+
import java.util.Queue;
9294
import java.util.Set;
9395
import java.util.function.Function;
9496
import java.util.stream.Collectors;
@@ -761,7 +763,10 @@ private HoodieTableMetaClient initializeMetaClient() throws IOException {
761763
* @return List consisting of {@code DirectoryInfo} for each partition found.
762764
*/
763765
private List<DirectoryInfo> listAllPartitionsFromFilesystem(String initializationTime, Set<String> pendingDataInstants) {
764-
List<StoragePath> pathsToList = new LinkedList<>();
766+
if (dataMetaClient.getActiveTimeline().countInstants() == 0) {
767+
return Collections.emptyList();
768+
}
769+
Queue<StoragePath> pathsToList = new ArrayDeque<>();
765770
pathsToList.add(new StoragePath(dataWriteConfig.getBasePath()));
766771

767772
List<DirectoryInfo> partitionsToBootstrap = new LinkedList<>();
@@ -773,16 +778,18 @@ private List<DirectoryInfo> listAllPartitionsFromFilesystem(String initializatio
773778
while (!pathsToList.isEmpty()) {
774779
// In each round we will list a section of directories
775780
int numDirsToList = Math.min(fileListingParallelism, pathsToList.size());
781+
List<StoragePath> pathsToProcess = new ArrayList<>(numDirsToList);
782+
for (int i = 0; i < numDirsToList; i++) {
783+
pathsToProcess.add(pathsToList.poll());
784+
}
776785
// List all directories in parallel
777786
engineContext.setJobStatus(this.getClass().getSimpleName(), "Listing " + numDirsToList + " partitions from filesystem");
778-
List<DirectoryInfo> processedDirectories = engineContext.map(pathsToList.subList(0, numDirsToList), path -> {
787+
List<DirectoryInfo> processedDirectories = engineContext.map(pathsToProcess, path -> {
779788
HoodieStorage storage = new HoodieHadoopStorage(path, storageConf);
780789
String relativeDirPath = FSUtils.getRelativePartitionPath(storageBasePath, path);
781790
return new DirectoryInfo(relativeDirPath, storage.listDirectEntries(path), initializationTime, pendingDataInstants);
782791
}, numDirsToList);
783792

784-
pathsToList = new LinkedList<>(pathsToList.subList(numDirsToList, pathsToList.size()));
785-
786793
// If the listing reveals a directory, add it to queue. If the listing reveals a hoodie partition, add it to
787794
// the results.
788795
for (DirectoryInfo dirInfo : processedDirectories) {
@@ -815,10 +822,10 @@ private List<DirectoryInfo> listAllPartitionsFromFilesystem(String initializatio
815822
* @return List consisting of {@code DirectoryInfo} for each partition found.
816823
*/
817824
private List<DirectoryInfo> listAllPartitionsFromMDT(String initializationTime, Set<String> pendingDataInstants) throws IOException {
818-
List<DirectoryInfo> dirinfoList = new LinkedList<>();
819825
List<String> allPartitionPaths = metadata.getAllPartitionPaths().stream()
820826
.map(partitionPath -> dataWriteConfig.getBasePath() + StoragePath.SEPARATOR_CHAR + partitionPath).collect(Collectors.toList());
821827
Map<String, List<StoragePathInfo>> partitionFileMap = metadata.getAllFilesInPartitions(allPartitionPaths);
828+
List<DirectoryInfo> dirinfoList = new ArrayList<>(partitionFileMap.size());
822829
for (Map.Entry<String, List<StoragePathInfo>> entry : partitionFileMap.entrySet()) {
823830
dirinfoList.add(new DirectoryInfo(entry.getKey(), entry.getValue(), initializationTime, pendingDataInstants));
824831
}

hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java

+11 -7
Original file line number | Diff line number | Diff line change
@@ -187,12 +187,12 @@ private void assignInserts(WorkloadProfile profile, HoodieEngineContext context)
187187

188188
List<SmallFile> smallFiles =
189189
filterSmallFilesInClustering(partitionPathToPendingClusteringFileGroupsId.getOrDefault(partitionPath, Collections.emptySet()),
190-
partitionSmallFilesMap.getOrDefault(partitionPath, new ArrayList<>()));
190+
partitionSmallFilesMap.getOrDefault(partitionPath, Collections.emptyList()));
191191

192192
this.smallFiles.addAll(smallFiles);
193193

194-
LOG.info("For partitionPath : " + partitionPath + " Total Small Files => " + smallFiles.size());
195-
LOG.debug("For partitionPath : " + partitionPath + " Small Files => " + smallFiles);
194+
LOG.info("For partitionPath : {} Total Small Files => {}", partitionPath, smallFiles.size());
195+
LOG.debug("For partitionPath : {} Small Files => {}", partitionPath, smallFiles);
196196

197197
long totalUnassignedInserts = pStat.getNumInserts();
198198
List<Integer> bucketNumbers = new ArrayList<>();
@@ -271,13 +271,17 @@ private void assignInserts(WorkloadProfile profile, HoodieEngineContext context)
271271
}
272272

273273
private Map<String, List<SmallFile>> getSmallFilesForPartitions(List<String> partitionPaths, HoodieEngineContext context) {
274-
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
275-
Map<String, List<SmallFile>> partitionSmallFilesMap = new HashMap<>();
276-
277274
if (config.getParquetSmallFileLimit() <= 0) {
278-
return partitionSmallFilesMap;
275+
return Collections.emptyMap();
276+
}
277+
278+
if (table.getMetaClient().getCommitsTimeline().filterCompletedInstants().countInstants() == 0) {
279+
return Collections.emptyMap();
279280
}
280281

282+
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
283+
Map<String, List<SmallFile>> partitionSmallFilesMap = new HashMap<>();
284+
281285
if (partitionPaths != null && partitionPaths.size() > 0) {
282286
context.setJobStatus(this.getClass().getSimpleName(), "Getting small files from partitions: " + config.getTableName());
283287
JavaRDD<String> partitionPathRdds = jsc.parallelize(partitionPaths, partitionPaths.size());

0 commit comments

Comments
 (0)