Skip to content

Commit

Permalink
Replace elasticsearch client with a pure HTTP client
Browse files Browse the repository at this point in the history
We don't need to depend on Elasticsearch itself, so we can greatly reduce the size of the final artifact.
Instead, we will just use a simple Java HTTP client here.

Closes dadoonet#117.
  • Loading branch information
dadoonet committed Oct 30, 2015
1 parent 5145d48 commit 81ee668
Show file tree
Hide file tree
Showing 39 changed files with 2,193 additions and 773 deletions.
69 changes: 64 additions & 5 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,23 @@
</build>

<dependencies>
<!--
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.1</version>
</dependency>
-->
<dependency>
<groupId>com.google.http-client</groupId>
<artifactId>google-http-client</artifactId>
<version>1.20.0</version>
</dependency>
<dependency>
<groupId>com.google.http-client</groupId>
<artifactId>google-http-client-jackson2</artifactId>
<version>1.20.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-test-framework</artifactId>
Expand All @@ -160,21 +177,63 @@
<artifactId>jackson-databind</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.datatype</groupId>
<artifactId>jackson-datatype-jsr310</artifactId>
<version>${jackson.version}</version>
</dependency>

<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>${elasticsearch.version}</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>${tika.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>${tika.version}</version>
<exclusions>
<!-- Not Apache2 License compatible -->
<exclusion>
<groupId>edu.ucar</groupId>
<artifactId>netcdf</artifactId>
</exclusion>
<!-- Not Apache2 License compatible -->
<exclusion>
<groupId>edu.ucar</groupId>
<artifactId>cdm</artifactId>
</exclusion>
<!-- Not Apache2 License compatible -->
<exclusion>
<groupId>edu.ucar</groupId>
<artifactId>httpservices</artifactId>
</exclusion>
<!-- Not Apache2 License compatible -->
<exclusion>
<groupId>edu.ucar</groupId>
<artifactId>grib</artifactId>
</exclusion>
<!-- Not Apache2 License compatible -->
<exclusion>
<groupId>edu.ucar</groupId>
<artifactId>netcdf4</artifactId>
</exclusion>
<!-- Not Apache2 License compatible -->
<exclusion>
<groupId>com.uwyn</groupId>
<artifactId>jhighlight</artifactId>
</exclusion>
<!-- ES core already has these -->
<exclusion>
<groupId>org.ow2.asm</groupId>
<artifactId>asm-debug-all</artifactId>
</exclusion>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<!--Dependency for parsing remote ssh directory [http://www.jcraft.com/jsch/]-->
<dependency>
Expand Down
310 changes: 165 additions & 145 deletions src/main/java/fr/pilato/elasticsearch/crawler/fs/FsCrawlerImpl.java

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
/*
* Licensed to David Pilato (the "Author") under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Author licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package fr.pilato.elasticsearch.crawler.fs.client;

import fr.pilato.elasticsearch.crawler.fs.meta.settings.TimeValue;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

/**
 * Bulk processor: buffers index and delete requests and hands them to the
 * {@link ElasticsearchClient} as a single {@link BulkRequest}.
 * <p>
 * A bulk is executed when the number of buffered actions reaches {@code bulkActions}
 * (unless that value is -1), when the optional periodic flush fires, or when
 * {@link #close()} is called.
 * <p>
 * The periodic flush runs on its own scheduler thread, so every access to the pending
 * request (add, periodic flush, close) is serialized on this instance's monitor.
 */
public class BulkProcessor {

    private static final Logger logger = LogManager.getLogger(BulkProcessor.class);

    private final int bulkActions;
    private final ElasticsearchClient client;
    private final Listener listener;
    // Pending request; only read or swapped while holding this instance's monitor.
    private BulkRequest bulkRequest;
    // Null when no flush interval was configured.
    private final ScheduledExecutorService executor;
    private volatile boolean closed = false;
    private final AtomicLong executionIdGen = new AtomicLong();

    /**
     * Creates a bulk processor.
     *
     * @param client        client used to execute the bulk requests
     * @param listener      callbacks invoked before and after each bulk execution
     * @param bulkActions   number of buffered actions that triggers a bulk execution; -1 disables this trigger
     * @param flushInterval delay between two periodic flushes; {@code null} disables the periodic flush
     */
    public BulkProcessor(ElasticsearchClient client, Listener listener, int bulkActions, TimeValue flushInterval) {
        this.bulkActions = bulkActions;
        this.bulkRequest = new BulkRequest();
        this.client = client;
        this.listener = listener;

        if (flushInterval != null) {
            executor = Executors.newScheduledThreadPool(1);
            executor.scheduleWithFixedDelay(this::executeWhenNeeded, 0, flushInterval.millis(), TimeUnit.MILLISECONDS);
        } else {
            executor = null;
        }
    }

    /**
     * Closes this processor: stops the periodic flush (if any) and executes the pending
     * actions one last time. Calling it again has no effect.
     * <p>
     * Synchronized so that the final flush cannot race with a concurrent add or with a
     * periodic flush that is still running.
     */
    public synchronized void close() {
        if (closed) {
            return;
        }
        closed = true;

        if (executor != null) {
            executor.shutdown();
        }

        if (bulkRequest.numberOfActions() > 0) {
            execute();
        }
    }

    /**
     * Adds an {@link IndexRequest} to the list of actions to execute.
     *
     * @throws IllegalStateException if this processor is already closed
     */
    public BulkProcessor add(IndexRequest request) {
        return add((SingleBulkRequest) request);
    }

    /**
     * Adds a {@link DeleteRequest} to the list of actions to execute.
     *
     * @throws IllegalStateException if this processor is already closed
     */
    public BulkProcessor add(DeleteRequest request) {
        return add((SingleBulkRequest) request);
    }

    /**
     * Adds either a delete or an index request.
     *
     * @throws IllegalStateException if this processor is already closed
     */
    public BulkProcessor add(SingleBulkRequest request) {
        return internalAdd(request);
    }

    /**
     * @throws IllegalStateException if this processor is already closed
     */
    protected void ensureOpen() {
        if (closed) {
            throw new IllegalStateException("bulk process already closed");
        }
    }

    /** Buffers the request and executes the bulk when the action limit is reached. */
    private synchronized BulkProcessor internalAdd(SingleBulkRequest request) {
        ensureOpen();
        bulkRequest.add(request);
        executeIfNeeded();
        return this;
    }

    /** Size-based trigger; the caller must hold this instance's monitor. */
    private void executeIfNeeded() {
        if (isOverTheLimit()) {
            execute();
        }
    }

    /**
     * Time-based trigger, run by the scheduler thread.
     * <p>
     * It takes the monitor because it reads and swaps the pending request concurrently
     * with {@link #add(SingleBulkRequest)}. When the processor is closed it simply returns:
     * {@link #close()} has already flushed what was pending, and an exception here would
     * only be swallowed by the scheduler.
     */
    private synchronized void executeWhenNeeded() {
        if (closed) {
            return;
        }
        if (bulkRequest.numberOfActions() > 0) {
            execute();
        }
    }

    /**
     * Swaps the pending request for a fresh one and executes it in a blocking fashion,
     * notifying the listener before and after. The caller must hold this instance's monitor.
     */
    private void execute() {
        final BulkRequest pending = this.bulkRequest;
        this.bulkRequest = new BulkRequest();
        final long executionId = executionIdGen.incrementAndGet();

        // Tracks whether the "success" callback was reached, so that a failure inside that
        // callback is not reported a second time through the "failure" callback.
        boolean afterCalled = false;
        try {
            listener.beforeBulk(executionId, pending);
            BulkResponse bulkItemResponses = client.bulk(pending);
            afterCalled = true;
            listener.afterBulk(executionId, pending, bulkItemResponses);
        } catch (Exception e) {
            if (!afterCalled) {
                listener.afterBulk(executionId, pending, e);
            } else {
                // The bulk itself went through; do not lose the listener's own failure silently.
                logger.warn("Bulk listener failed while handling the bulk response", e);
            }
        }
    }

    private boolean isOverTheLimit() {
        return (bulkActions != -1) && (bulkRequest.numberOfActions() >= bulkActions);
    }

    /**
     * Builder for {@link BulkProcessor}.
     * <p>
     * Unless set, {@code bulkActions} is 0 (a bulk is then executed on every add) and
     * {@code flushInterval} is {@code null} (no periodic flush).
     */
    static class Builder {

        private int bulkActions;
        private TimeValue flushInterval;
        private final ElasticsearchClient client;
        private final Listener listener;

        public Builder(ElasticsearchClient client, Listener listener) {
            this.client = client;
            this.listener = listener;
        }

        /** Sets the number of buffered actions that triggers a bulk execution; -1 disables this trigger. */
        public Builder setBulkActions(int bulkActions) {
            this.bulkActions = bulkActions;
            return this;
        }

        /** Sets the delay between two periodic flushes; {@code null} disables the periodic flush. */
        public Builder setFlushInterval(TimeValue flushInterval) {
            this.flushInterval = flushInterval;
            return this;
        }

        public BulkProcessor build() {
            return new BulkProcessor(client, listener, bulkActions, flushInterval);
        }
    }

    static Builder builder(ElasticsearchClient client, Listener listener) {
        return new Builder(client, listener);
    }

    /**
     * Callbacks invoked around each bulk execution. They are called while the processor's
     * monitor is held, on the thread that triggered the execution.
     */
    public interface Listener {

        /** Called right before the bulk is sent. */
        void beforeBulk(long executionId, BulkRequest request);

        /** Called when the client returned a response (which may still contain item failures). */
        void afterBulk(long executionId, BulkRequest request, BulkResponse response);

        /** Called when {@code beforeBulk} or the bulk call itself threw an exception. */
        void afterBulk(long executionId, BulkRequest request, Throwable failure);
    }

    /**
     * Build a simple elasticsearch bulk processor whose listener only logs what happens.
     *
     * @param client        elasticsearch client
     * @param bulkSize      number of actions that triggers a bulk execution; -1 disables this trigger
     * @param flushInterval delay between two periodic flushes; {@code null} disables the periodic flush
     * @return a bulk processor
     */
    public static BulkProcessor simpleBulkProcessor(ElasticsearchClient client, int bulkSize, TimeValue flushInterval) {
        return builder(client, new Listener() {
            @Override
            public void beforeBulk(long executionId, BulkRequest request) {
                logger.debug("Going to execute new bulk composed of {} actions", request.numberOfActions());
            }

            @Override
            public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
                logger.debug("Executed bulk composed of {} actions", request.numberOfActions());
                if (response.hasFailures()) {
                    // The placeholder is required: without it the failure message is never rendered.
                    logger.warn("There was failures while executing bulk: {}", response.buildFailureMessage());
                    if (logger.isDebugEnabled()) {
                        for (BulkResponse.BulkItemResponse item : response.getItems()) {
                            if (item.isFailed()) {
                                logger.debug("Error for {}/{}/{} for {} operation: {}", item.getIndex(),
                                        item.getType(), item.getId(), item.getOpType(), item.getFailureMessage());
                            }
                        }
                    }
                }
            }

            @Override
            public void afterBulk(long executionId, BulkRequest request, Throwable failure) {
                logger.warn("Error executing bulk", failure);
            }
        })
                .setBulkActions(bulkSize)
                .setFlushInterval(flushInterval)
                .build();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Licensed to David Pilato (the "Author") under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Author licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package fr.pilato.elasticsearch.crawler.fs.client;

import com.google.common.collect.Lists;

import java.util.List;

/**
 * Container for the single requests that make up one bulk call, kept in insertion order.
 */
public class BulkRequest {

    private final List<SingleBulkRequest> actions = Lists.newArrayList();

    /**
     * Appends a request to this bulk.
     *
     * @param request the index or delete request to append
     */
    public void add(SingleBulkRequest request) {
        this.actions.add(request);
    }

    /**
     * @return the backing list of requests (not a copy)
     */
    public List<SingleBulkRequest> getRequests() {
        return this.actions;
    }

    /**
     * @return how many requests have been added so far
     */
    public int numberOfActions() {
        return this.actions.size();
    }
}
Loading

0 comments on commit 81ee668

Please sign in to comment.