@@ -83,7 +83,7 @@ public function clean(string $html, array $config = []): string
8383 }
8484
8585 // convert line-breaks to UNIX
86- $ this ->convNlOs ( $ html );
86+ $ html = preg_replace ( " ( \r\n | \r ) " , $ this ->newline , $ html );
8787
8888 $ manipulations = [];
8989
@@ -95,16 +95,17 @@ public function clean(string $html, array $config = []): string
9595 $ manipulations ['removeComments ' ] = GeneralUtility::makeInstance (RemoveComments::class);
9696 }
9797
98- if (!empty ($ this ->headerComment )) {
99- $ this ->includeHeaderComment ($ html );
100- }
101-
10298 foreach ($ manipulations as $ key => $ manipulation ) {
10399 /** @var ManipulationInterface $manipulation */
104100 $ configuration = isset ($ config [$ key . '. ' ]) && \is_array ($ config [$ key . '. ' ]) ? $ config [$ key . '. ' ] : [];
105101 $ html = $ manipulation ->manipulate ($ html , $ configuration );
106102 }
107103
104+ // include configured header comment in HTML content block
105+ if (!empty ($ this ->headerComment )) {
106+ $ html = preg_replace ('/^(-->)$/m ' , "\n\t" . $ this ->headerComment . "\n$1 " , $ html , 1 );
107+ }
108+
108109 // cleanup HTML5 self-closing elements
109110 if (!isset ($ GLOBALS ['TSFE ' ]->config ['config ' ]['doctype ' ])
110111 || 'x ' !== substr ($ GLOBALS ['TSFE ' ]->config ['config ' ]['doctype ' ], 0 , 1 )) {
@@ -115,276 +116,16 @@ public function clean(string $html, array $config = []): string
115116 );
116117 }
117118
118- if ($ this ->formatType > 0 ) {
119- $ html = $ this ->formatHtml ($ html );
119+ if ($ this ->formatType ) {
120+ $ indenter = new \Gajus \Dindent \Indenter (['indentation_character ' => $ this ->tab ]);
121+ $ html = $ indenter ->indent ($ html );
120122 }
121123
122- // remove white space after line ending
123- $ this ->rTrimLines ($ html );
124-
125124 // recover line-breaks
126125 if (Environment::isWindows ()) {
127126 $ html = str_replace ($ this ->newline , "\r\n" , $ html );
128127 }
129128
130129 return (string ) $ html ;
131130 }
132-
133- /**
134- * Formats the (X)HTML code:
135- * - indents according to the hierarchy of the tags
136- * - removes empty spaces between tags
137- * - removes linebreaks within tags (spares where necessary: pre, textarea, comments, ..)
138- * $this->formatType selects the line-break strategy:
139- * 0 => off
140- * 1 => no line break at all (code in one line)
141- * 2 => minimalistic line breaks (structure defining box-elements)
142- * 3 => aesthetic line breaks (important box-elements)
143- * 4 => logic line breaks (all box-elements)
144- * 5 => max line breaks (all elements); this method handles 4 and 5 in the same branch (formatType >= 4).
145- */
146- protected function formatHtml (string $ html ): string
147- {
148- // Save the original formatting of pre, textarea, comment, style and script blocks and replace them with markers
149- preg_match_all (
150- '/(?s)((<!--.*?-->)|(<[ \n\r]*pre[^>]*>.*?<[ \n\r]*\/pre[^>]*>)|(<[ \n\r]*textarea[^>]*>.*?<[ \n\r]*\/textarea[^>]*>)|(<[ \n\r]*style[^>]*>.*?<[ \n\r]*\/style[^>]*>)|(<[ \n\r]*script[^>]*>.*?<[ \n\r]*\/script[^>]*>))/im ' ,
151- $ html ,
152- $ matches
153- );
154- $ noFormat = $ matches [0 ]; // full matches only: these blocks must not be reformatted
155- for ($ i = 0 ; $ i < \count ($ noFormat ); ++$ i ) {
156- $ html = str_replace ($ noFormat [$ i ], "\n<!-- ELEMENT {$ i } --> " , $ html );
157- }
158-
159- // define the groups of box elements used to decide where line breaks go
160- $ trueBoxElements = 'address|blockquote|center|dir|div|dl|fieldset|form|h1|h2|h3|h4|h5|h6|hr|isindex|menu|noframes|noscript|ol|p|pre|table|ul|article|aside|details|figcaption|figure|footer|header|hgroup|menu|nav|section ' ;
161- $ functionalBoxElements = 'dd|dt|frameset|li|tbody|td|tfoot|th|thead|tr|colgroup ' ;
162- $ usableBoxElements = 'applet|button|del|iframe|ins|map|object|script ' ;
163- $ imagineBoxElements = 'html|body|head|meta|title|link|script|base|!-- ' ;
164- $ allBoxLikeElements = '(?> ' . $ trueBoxElements . '| ' . $ functionalBoxElements . '| ' . $ usableBoxElements . '| ' . $ imagineBoxElements . ') ' ;
165- $ esteticBoxLikeElements = '(?>html|head|body|meta name|title|div|table|h1|h2|h3|h4|h5|h6|p|form|pre|center|!--) ' ;
166- $ structureBoxLikeElements = '(?>html|head|body|div|!--) ' ;
167-
168- // split the html into its elements (tags are kept as separate fragments, empty fragments are dropped)
169- $ htmlArrayTemp = preg_split (
170- '/(<(?:[^<>]+(?:"[^"]*"| \'[^ \']* \')?)+>)/ ' ,
171- $ html ,
172- -1 ,
173- \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY
174- );
175-
176- if (false === $ htmlArrayTemp ) {
177- // Splitting failed: restore the saved blocks and return the markup unformatted
178- for ($ i = 0 ; $ i < \count ($ noFormat ); ++$ i ) {
179- $ html = str_replace ("<!-- ELEMENT {$ i } --> " , $ noFormat [$ i ], $ html );
180- }
181-
182- return $ html ;
183- }
184- // replace whitespace-only fragments with $this->emptySpaceChar; index 0 stays an empty placeholder
185- $ htmlArray = ['' ];
186- $ index = 1 ;
187- for ($ x = 0 ; $ x < \count ($ htmlArrayTemp ); ++$ x ) {
188- $ text = trim ($ htmlArrayTemp [$ x ]);
189- $ htmlArray [$ index ] = '' !== $ text ? $ htmlArrayTemp [$ x ] : $ this ->emptySpaceChar ;
190- ++$ index ;
191- }
192-
193- // rebuild the html fragment by fragment, tracking the indentation depth in $tabs
194- $ html = '' ;
195- $ tabs = 0 ;
196- for ($ x = 0 ; $ x < \count ($ htmlArray ); ++$ x ) {
197- $ htmlArrayBefore = $ htmlArray [$ x - 1 ] ?? '' ;
198- $ htmlArrayCurrent = $ htmlArray [$ x ] ?? '' ;
199-
200- // check whether the current element should start on a new line
201- $ newline = false ;
202- if ('<?xml ' == substr ($ htmlArrayBefore , 0 , 5 )) {
203- $ newline = true ;
204- } elseif (2 == $ this ->formatType && ( // minimalistic line breaks
205- // the current element gets a line break before itself
206- preg_match (
207- '/< ' . $ structureBoxLikeElements . '(.*)>/Usi ' ,
208- $ htmlArrayCurrent
209- ) || preg_match (
210- '/< ' . $ structureBoxLikeElements . '(.*) \/>/Usi ' ,
211- $ htmlArrayCurrent
212- ) // the preceding element is one that is followed by a line break
213- || preg_match (
214- '/<\/ ' . $ structureBoxLikeElements . '(.*)>/Usi ' ,
215- $ htmlArrayBefore
216- ) || '<!-- ' == substr (
217- $ htmlArrayBefore ,
218- 0 ,
219- 4
220- ) || preg_match ('/< ' . $ structureBoxLikeElements . '(.*) \/>/Usi ' , $ htmlArrayBefore ))
221- ) {
222- $ newline = true ;
223- } elseif (3 == $ this ->formatType && ( // aesthetic line breaks
224- // the current element gets a line break before itself
225- preg_match (
226- '/< ' . $ esteticBoxLikeElements . '(.*)>/Usi ' ,
227- $ htmlArrayCurrent
228- ) || preg_match (
229- '/< ' . $ esteticBoxLikeElements . '(.*) \/>/Usi ' ,
230- $ htmlArrayCurrent
231- ) // the preceding element is one that is followed by a line break
232- || preg_match ('/<\/ ' . $ esteticBoxLikeElements . '(.*)>/Usi ' , $ htmlArrayBefore ) || '<!-- ' == substr (
233- $ htmlArrayBefore ,
234- 0 ,
235- 4
236- ) || preg_match ('/< ' . $ esteticBoxLikeElements . '(.*) \/>/Usi ' , $ htmlArrayBefore ))
237- ) {
238- $ newline = true ;
239- } elseif ($ this ->formatType >= 4 && ( // logical line breaks (also used for type 5)
240- // the current element gets a line break before itself
241- preg_match (
242- '/< ' . $ allBoxLikeElements . '(.*)>/Usi ' ,
243- $ htmlArrayCurrent
244- ) || preg_match (
245- '/< ' . $ allBoxLikeElements . '(.*) \/>/Usi ' ,
246- $ htmlArrayCurrent
247- ) // the preceding element is one that is followed by a line break
248- || preg_match ('/<\/ ' . $ allBoxLikeElements . '(.*)>/Usi ' , $ htmlArrayBefore ) || '<!-- ' == substr (
249- $ htmlArrayBefore ,
250- 0 ,
251- 4
252- ) || preg_match ('/< ' . $ allBoxLikeElements . '(.*) \/>/Usi ' , $ htmlArrayBefore ))
253- ) {
254- $ newline = true ;
255- }
256-
257- // closing tag: decrease the indentation depth before it is printed
258- if ('</ ' == substr ($ htmlArrayCurrent , 0 , 2 )) {
259- --$ tabs ;
260- }
261-
262- // add the line break and $tabs indentation units in front of the current tag
263- if ($ newline ) {
264- $ html .= $ this ->newline ;
265- for ($ y = 0 ; $ y < $ tabs ; ++$ y ) {
266- $ html .= $ this ->tab ;
267- }
268- }
269-
270- // remove white space and line breaks, then append the current fragment to the html string
271- if ('<![CDATA[ ' == substr ($ htmlArrayCurrent , 0 , 9 ) // CDATA / XML prolog: collapse white space but keep line breaks
272- || '<?xml ' == substr ($ htmlArrayCurrent , 0 , 5 )
273- ) {
274- $ html .= $ this ->killWhiteSpace ($ htmlArrayCurrent );
275- } else { // everything else: remove all line breaks
276- $ html .= $ this ->killLineBreaks ($ htmlArrayCurrent );
277- }
278-
279- // opening tag: increase the indentation depth unless it is one of the elements excluded below
280- if ('< ' == substr ($ htmlArrayCurrent , 0 , 1 ) && '/ ' != substr ($ htmlArrayCurrent , 1 , 1 )) {
281- if (' ' !== substr ($ htmlArrayCurrent , 1 , 1 )
282- && 'img ' !== substr ($ htmlArrayCurrent , 1 , 3 )
283- && 'source ' !== substr ($ htmlArrayCurrent , 1 , 6 )
284- && 'br ' !== substr ($ htmlArrayCurrent , 1 , 2 )
285- && 'hr ' !== substr ($ htmlArrayCurrent , 1 , 2 )
286- && 'input ' !== substr ($ htmlArrayCurrent , 1 , 5 )
287- && 'link ' !== substr ($ htmlArrayCurrent , 1 , 4 )
288- && 'meta ' !== substr ($ htmlArrayCurrent , 1 , 4 )
289- && 'col ' !== substr ($ htmlArrayCurrent , 1 , 4 ) // NOTE(review): substr length (4) differs from the pattern used by the neighbouring checks - verify the literal against the upstream file
290- && 'frame ' !== substr ($ htmlArrayCurrent , 1 , 5 )
291- && 'isindex ' !== substr ($ htmlArrayCurrent , 1 , 7 )
292- && 'param ' !== substr ($ htmlArrayCurrent , 1 , 5 )
293- && 'area ' !== substr ($ htmlArrayCurrent , 1 , 4 )
294- && 'base ' !== substr ($ htmlArrayCurrent , 1 , 4 )
295- && '<! ' !== substr ($ htmlArrayCurrent , 0 , 2 )
296- && '<?xml ' !== substr ($ htmlArrayCurrent , 0 , 5 )
297- ) {
298- ++$ tabs ;
299- }
300- }
301- }
302-
303- // Remove empty lines (not needed for type 1, which emits no line breaks of its own)
304- if ($ this ->formatType > 1 ) {
305- $ this ->removeEmptyLines ($ html );
306- }
307-
308- // Put the saved comment, pre, textarea, style and script blocks back in place of their markers
309- for ($ i = 0 ; $ i < \count ($ noFormat ); ++$ i ) {
310- $ html = str_replace ("<!-- ELEMENT {$ i } --> " , $ noFormat [$ i ], $ html );
311- }
312-
313- // append a debug comment when the depth counter did not return to zero and debugComment is enabled
314- if (0 != $ tabs && true === $ this ->debugComment ) {
315- $ html .= "<!-- {$ tabs } open elements found --> " ;
316- }
317-
318- return $ html ;
319- }
320-
/**
 * Remove ALL line breaks and collapse every run of two or more white space
 * characters into a single space.
 *
 * Line breaks are removed without a replacement, so text on adjacent lines
 * is joined directly.
 *
 * @param string $html markup fragment to flatten
 *
 * @return string the flattened fragment; if the regular expression fails
 *                (e.g. malformed UTF-8 under the /u modifier) the fragment
 *                is returned with only the line breaks removed
 */
protected function killLineBreaks(string $html): string
{
    $html = str_replace($this->newline, '', $html);

    // preg_replace() yields null on error, which would violate the declared
    // string return type; fall back to the un-collapsed fragment instead.
    return preg_replace('/\s\s+/u', ' ', $html) ?? $html;
}
331-
/**
 * Collapse multiple white space characters while keeping line breaks.
 *
 * Every line is trimmed, lines that are empty after trimming are dropped,
 * and runs of two or more white space characters inside a line become a
 * single space.
 *
 * @param string $html markup fragment, lines separated by $this->newline
 *
 * @return string the cleaned fragment, lines joined with $this->newline
 */
protected function killWhiteSpace(string $html): string
{
    $lines = [];

    // Build a fresh list instead of unsetting entries of the array being
    // counted by the loop condition: that shrank the bound and left the
    // trailing lines unprocessed.
    foreach (explode($this->newline, $html) as $line) {
        $line = trim($line);

        // Strict comparison so that a line consisting only of "0" is kept.
        if ('' === $line) {
            continue;
        }

        // Keep the trimmed line as-is should the regular expression fail.
        $lines[] = preg_replace('/\s\s+/', ' ', $line) ?? $line;
    }

    return implode($this->newline, $lines);
}
350-
/**
 * Remove white space at the end of lines.
 *
 * Because the pattern's \s also matches line breaks, lines that contain
 * only white space are removed together with their line break.
 *
 * @param string $html markup to clean, modified in place; left untouched
 *                     when the regular expression fails
 */
protected function rTrimLines(string &$html): void
{
    $trimmed = preg_replace('/\s+$/m', '', $html);

    // preg_replace() returns null on error; never overwrite the caller's
    // document with null (it would be cast to an empty string later on).
    if (null !== $trimmed) {
        $html = $trimmed;
    }
}
358-
/**
 * Normalise line breaks: every "\r\n" and every lone "\r" is replaced by
 * $this->newline.
 *
 * @param string $html markup to normalise, modified in place; left untouched
 *                     when the regular expression fails
 */
protected function convNlOs(string &$html): void
{
    $converted = preg_replace("(\r\n|\r)", $this->newline, $html);

    // preg_replace() returns null on error; keep the original markup then.
    if (null !== $converted) {
        $html = $converted;
    }
}
366-
/**
 * Drop every line that is empty or consists of white space only.
 *
 * @param string $html markup whose lines are separated by $this->newline,
 *                     modified in place
 */
protected function removeEmptyLines(string &$html): void
{
    $keptLines = array_filter(
        explode($this->newline, $html),
        static function (string $line): bool {
            return '' !== trim($line);
        }
    );

    // implode() ignores the (now sparse) keys left behind by array_filter().
    $html = implode($this->newline, $keptLines);
}
382-
/**
 * Include the configured header comment in the HTML content block.
 *
 * The comment text is inserted on its own tab-indented line directly in
 * front of every line that consists solely of "-->".
 *
 * @param string $html markup to extend, modified in place; left untouched
 *                     when the regular expression fails
 */
public function includeHeaderComment(string &$html): void
{
    $headerComment = $this->headerComment;

    // A callback is used so that "$1", "\\1" and similar sequences inside
    // the configured comment are inserted literally instead of being
    // interpreted as backreferences of the replacement string.
    $result = preg_replace_callback(
        '/^(-->)$/m',
        static function (array $match) use ($headerComment): string {
            return "\n\t" . $headerComment . "\n" . $match[1];
        },
        $html
    );

    // preg_replace_callback() returns null on error; keep the markup then.
    if (null !== $result) {
        $html = $result;
    }
}
390131}
0 commit comments