6666import org .apache .stormcrawler .proxy .SCProxy ;
6767import org .apache .stormcrawler .util .ConfUtils ;
6868import org .apache .stormcrawler .util .CookieConverter ;
69- import org .jetbrains .annotations .Nullable ;
7069import org .slf4j .LoggerFactory ;
7170
7271/** Uses Apache httpclient to handle http and https. */
@@ -82,6 +81,11 @@ public class HttpProtocol extends AbstractHttpProtocol
8281
8382 private HttpClientBuilder builder ;
8483
84+ private CloseableHttpClient client ;
85+
86+ private String userAgent ;
87+ private Collection <BasicHeader > defaultHeaders ;
88+
8589 private RequestConfig requestConfig ;
8690 private RequestConfig .Builder requestConfigBuilder ;
8791
@@ -102,9 +106,9 @@ public void configure(final Config conf) {
102106
103107 globalMaxContent = ConfUtils .getInt (conf , "http.content.limit" , -1 );
104108
105- String userAgent = getAgentString (conf );
109+ userAgent = getAgentString (conf );
106110
107- Collection < BasicHeader > defaultHeaders = new LinkedList <>();
111+ defaultHeaders = new LinkedList <>();
108112
109113 String accept = ConfUtils .getString (conf , "http.accept" );
110114 if (StringUtils .isNotBlank (accept )) {
@@ -153,6 +157,8 @@ public void configure(final Config conf) {
153157 .setCookieSpec (CookieSpecs .STANDARD );
154158
155159 requestConfig = requestConfigBuilder .build ();
160+
161+ client = builder .build ();
156162 }
157163
158164 @ Override
@@ -163,8 +169,8 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except
163169 // set default request config to global config
164170 RequestConfig reqConfig = requestConfig ;
165171
166- // default to the shared builder for non-proxy requests
167- HttpClientBuilder clientBuilder = builder ;
172+ // default to the shared client for non-proxy requests
173+ CloseableHttpClient httpClient = client ;
168174
169175 // conditionally add a dynamic proxy
170176 if (proxyManager != null ) {
@@ -173,10 +179,12 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except
173179 if (proxOptional .isPresent ()) {
174180 SCProxy prox = proxOptional .get ();
175181
176- // create local copies of builders to avoid polluting shared
177- // state across concurrent requests
182+ // create a new builder with the same defaults (user-agent,
183+ // headers) to avoid losing them in proxied requests
178184 HttpClientBuilder localBuilder =
179185 HttpClients .custom ()
186+ .setUserAgent (userAgent )
187+ .setDefaultHeaders (defaultHeaders )
180188 .setConnectionManager (CONNECTION_MANAGER )
181189 .setConnectionManagerShared (true )
182190 .disableRedirectHandling ()
@@ -215,7 +223,7 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except
215223 System .currentTimeMillis () - buildStart );
216224
217225 LOG .debug ("fetching with " + prox .toString ());
218- clientBuilder = localBuilder ;
226+ httpClient = localBuilder . build () ;
219227 }
220228 }
221229
@@ -268,11 +276,7 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except
268276
269277 request .setConfig (reqConfig );
270278
271- // no need to release the connection explicitly as this is handled
272- // automatically. The client itself must be closed though.
273- try (CloseableHttpClient httpclient = clientBuilder .build ()) {
274- return httpclient .execute (request , responseHandler );
275- }
279+ return httpClient .execute (request , responseHandler );
276280 }
277281
278282 private void addCookiesToRequest (HttpRequestBase request , Metadata md ) {
@@ -353,7 +357,6 @@ public ProtocolResponse handleResponse(final HttpResponse response) throws IOExc
353357 };
354358 }
355359
356- @ Nullable
357360 private static byte [] toByteArray (
358361 final HttpEntity entity , int maxContent , MutableBoolean trimmed ) throws IOException {
359362
@@ -363,35 +366,46 @@ private static byte[] toByteArray(
363366
364367 final InputStream instream = entity .getContent ();
365368 if (instream == null ) {
366- return null ;
367- }
368- Args .check (
369- (entity .getContentLength () <= Constants .MAX_ARRAY_SIZE )
370- || (maxContent >= 0 && maxContent <= Constants .MAX_ARRAY_SIZE ),
371- "HTTP entity too large to be buffered in memory" );
372- int reportedLength = (int ) entity .getContentLength ();
373- // set default size for buffer: 100 KB
374- int bufferInitSize = 102400 ;
375- if (reportedLength != -1 ) {
376- bufferInitSize = reportedLength ;
377- }
378- // avoid init of too large a buffer when we will trim anyway
379- if (maxContent != -1 && bufferInitSize > maxContent ) {
380- bufferInitSize = maxContent ;
369+ return new byte [] {};
381370 }
382- final ByteArrayBuffer buffer = new ByteArrayBuffer (bufferInitSize );
383- final byte [] tmp = new byte [4096 ];
384- int lengthRead ;
385- while ((lengthRead = instream .read (tmp )) != -1 ) {
386- // check whether we need to trim
387- if (maxContent != -1 && buffer .length () + lengthRead > maxContent ) {
388- buffer .append (tmp , 0 , maxContent - buffer .length ());
389- trimmed .setValue (true );
390- break ;
371+ try (instream ) {
372+ Args .check (
373+ (entity .getContentLength () <= Constants .MAX_ARRAY_SIZE )
374+ || (maxContent >= 0 && maxContent <= Constants .MAX_ARRAY_SIZE ),
375+ "HTTP entity too large to be buffered in memory" );
376+ int reportedLength = (int ) entity .getContentLength ();
377+ // set default size for buffer: 100 KB
378+ int bufferInitSize = 102400 ;
379+ if (reportedLength != -1 ) {
380+ bufferInitSize = reportedLength ;
391381 }
392- buffer .append (tmp , 0 , lengthRead );
382+ // avoid init of too large a buffer when we will trim anyway
383+ if (maxContent != -1 && bufferInitSize > maxContent ) {
384+ bufferInitSize = maxContent ;
385+ }
386+ final ByteArrayBuffer buffer = new ByteArrayBuffer (bufferInitSize );
387+ final byte [] tmp = new byte [4096 ];
388+ int lengthRead ;
389+ while ((lengthRead = instream .read (tmp )) != -1 ) {
390+ // check whether we need to trim
391+ if (maxContent != -1 && buffer .length () + lengthRead > maxContent ) {
392+ buffer .append (tmp , 0 , maxContent - buffer .length ());
393+ trimmed .setValue (true );
394+ break ;
395+ }
396+ buffer .append (tmp , 0 , lengthRead );
397+ }
398+ return buffer .toByteArray ();
399+ }
400+ }
401+
402+ @ Override
403+ public void cleanup () {
404+ try {
405+ client .close ();
406+ } catch (IOException e ) {
407+ LOG .error ("Error closing HTTP client" , e );
393408 }
394- return buffer .toByteArray ();
395409 }
396410
397411 public static void main (String [] args ) throws Exception {
0 commit comments