Skip to content

Commit d485783

Browse files
authored
Fix bugs and resource leaks in Apache HttpClient protocol implementation (#1863)
* Fix bugs and resource leaks in Apache HttpClient protocol implementation The proxy race condition fix (a08a456) introduced two regressions: The local builder for proxied requests lost: Default headers User-Agent A new CloseableHttpClient was created on every request, even when no proxy was used Fixes applied Store userAgent and defaultHeaders as fields Pass these fields to the proxy builder Build the client once in configure() and reuse it for non-proxy requests Additional fixes Fix toByteArray() returning null when the entity input stream is null Close the static CONNECTION_MANAGER in cleanup() Wrap the entity InputStream in a try-with-resources block * Revert: Close the static CONNECTION_MANAGER in cleanup()
1 parent c179bae commit d485783

1 file changed

Lines changed: 54 additions & 40 deletions

File tree

core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java

Lines changed: 54 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@
6666
import org.apache.stormcrawler.proxy.SCProxy;
6767
import org.apache.stormcrawler.util.ConfUtils;
6868
import org.apache.stormcrawler.util.CookieConverter;
69-
import org.jetbrains.annotations.Nullable;
7069
import org.slf4j.LoggerFactory;
7170

7271
/** Uses Apache httpclient to handle http and https. */
@@ -82,6 +81,11 @@ public class HttpProtocol extends AbstractHttpProtocol
8281

8382
private HttpClientBuilder builder;
8483

84+
private CloseableHttpClient client;
85+
86+
private String userAgent;
87+
private Collection<BasicHeader> defaultHeaders;
88+
8589
private RequestConfig requestConfig;
8690
private RequestConfig.Builder requestConfigBuilder;
8791

@@ -102,9 +106,9 @@ public void configure(final Config conf) {
102106

103107
globalMaxContent = ConfUtils.getInt(conf, "http.content.limit", -1);
104108

105-
String userAgent = getAgentString(conf);
109+
userAgent = getAgentString(conf);
106110

107-
Collection<BasicHeader> defaultHeaders = new LinkedList<>();
111+
defaultHeaders = new LinkedList<>();
108112

109113
String accept = ConfUtils.getString(conf, "http.accept");
110114
if (StringUtils.isNotBlank(accept)) {
@@ -153,6 +157,8 @@ public void configure(final Config conf) {
153157
.setCookieSpec(CookieSpecs.STANDARD);
154158

155159
requestConfig = requestConfigBuilder.build();
160+
161+
client = builder.build();
156162
}
157163

158164
@Override
@@ -163,8 +169,8 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except
163169
// set default request config to global config
164170
RequestConfig reqConfig = requestConfig;
165171

166-
// default to the shared builder for non-proxy requests
167-
HttpClientBuilder clientBuilder = builder;
172+
// default to the shared client for non-proxy requests
173+
CloseableHttpClient httpClient = client;
168174

169175
// conditionally add a dynamic proxy
170176
if (proxyManager != null) {
@@ -173,10 +179,12 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except
173179
if (proxOptional.isPresent()) {
174180
SCProxy prox = proxOptional.get();
175181

176-
// create local copies of builders to avoid polluting shared
177-
// state across concurrent requests
182+
// create a new builder with the same defaults (user-agent,
183+
// headers) to avoid losing them in proxied requests
178184
HttpClientBuilder localBuilder =
179185
HttpClients.custom()
186+
.setUserAgent(userAgent)
187+
.setDefaultHeaders(defaultHeaders)
180188
.setConnectionManager(CONNECTION_MANAGER)
181189
.setConnectionManagerShared(true)
182190
.disableRedirectHandling()
@@ -215,7 +223,7 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except
215223
System.currentTimeMillis() - buildStart);
216224

217225
LOG.debug("fetching with " + prox.toString());
218-
clientBuilder = localBuilder;
226+
httpClient = localBuilder.build();
219227
}
220228
}
221229

@@ -268,11 +276,7 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except
268276

269277
request.setConfig(reqConfig);
270278

271-
// no need to release the connection explicitly as this is handled
272-
// automatically. The client itself must be closed though.
273-
try (CloseableHttpClient httpclient = clientBuilder.build()) {
274-
return httpclient.execute(request, responseHandler);
275-
}
279+
return httpClient.execute(request, responseHandler);
276280
}
277281

278282
private void addCookiesToRequest(HttpRequestBase request, Metadata md) {
@@ -353,7 +357,6 @@ public ProtocolResponse handleResponse(final HttpResponse response) throws IOExc
353357
};
354358
}
355359

356-
@Nullable
357360
private static byte[] toByteArray(
358361
final HttpEntity entity, int maxContent, MutableBoolean trimmed) throws IOException {
359362

@@ -363,35 +366,46 @@ private static byte[] toByteArray(
363366

364367
final InputStream instream = entity.getContent();
365368
if (instream == null) {
366-
return null;
367-
}
368-
Args.check(
369-
(entity.getContentLength() <= Constants.MAX_ARRAY_SIZE)
370-
|| (maxContent >= 0 && maxContent <= Constants.MAX_ARRAY_SIZE),
371-
"HTTP entity too large to be buffered in memory");
372-
int reportedLength = (int) entity.getContentLength();
373-
// set default size for buffer: 100 KB
374-
int bufferInitSize = 102400;
375-
if (reportedLength != -1) {
376-
bufferInitSize = reportedLength;
377-
}
378-
// avoid init of too large a buffer when we will trim anyway
379-
if (maxContent != -1 && bufferInitSize > maxContent) {
380-
bufferInitSize = maxContent;
369+
return new byte[] {};
381370
}
382-
final ByteArrayBuffer buffer = new ByteArrayBuffer(bufferInitSize);
383-
final byte[] tmp = new byte[4096];
384-
int lengthRead;
385-
while ((lengthRead = instream.read(tmp)) != -1) {
386-
// check whether we need to trim
387-
if (maxContent != -1 && buffer.length() + lengthRead > maxContent) {
388-
buffer.append(tmp, 0, maxContent - buffer.length());
389-
trimmed.setValue(true);
390-
break;
371+
try (instream) {
372+
Args.check(
373+
(entity.getContentLength() <= Constants.MAX_ARRAY_SIZE)
374+
|| (maxContent >= 0 && maxContent <= Constants.MAX_ARRAY_SIZE),
375+
"HTTP entity too large to be buffered in memory");
376+
int reportedLength = (int) entity.getContentLength();
377+
// set default size for buffer: 100 KB
378+
int bufferInitSize = 102400;
379+
if (reportedLength != -1) {
380+
bufferInitSize = reportedLength;
391381
}
392-
buffer.append(tmp, 0, lengthRead);
382+
// avoid init of too large a buffer when we will trim anyway
383+
if (maxContent != -1 && bufferInitSize > maxContent) {
384+
bufferInitSize = maxContent;
385+
}
386+
final ByteArrayBuffer buffer = new ByteArrayBuffer(bufferInitSize);
387+
final byte[] tmp = new byte[4096];
388+
int lengthRead;
389+
while ((lengthRead = instream.read(tmp)) != -1) {
390+
// check whether we need to trim
391+
if (maxContent != -1 && buffer.length() + lengthRead > maxContent) {
392+
buffer.append(tmp, 0, maxContent - buffer.length());
393+
trimmed.setValue(true);
394+
break;
395+
}
396+
buffer.append(tmp, 0, lengthRead);
397+
}
398+
return buffer.toByteArray();
399+
}
400+
}
401+
402+
@Override
403+
public void cleanup() {
404+
try {
405+
client.close();
406+
} catch (IOException e) {
407+
LOG.error("Error closing HTTP client", e);
393408
}
394-
return buffer.toByteArray();
395409
}
396410

397411
public static void main(String[] args) throws Exception {

0 commit comments

Comments
 (0)