34.12KiB; Perl | 2021-02-26 14:28:32+01 | Statements 646 | SLOC 1107
1
package CGI::Buffer;
2
3
use 5.14.0;	# For IO::Compress::Brotli
4
use strict;
5
use warnings;
6
7
use Digest::MD5;
8
use IO::String;
9
use CGI::Info;
10
use Carp;
11
use HTTP::Date;
12
use Text::Diff;	# For debugging
13
14
=head1 NAME
15
16
CGI::Buffer - Verify, Cache and Optimise CGI Output
17
18
=head1 VERSION
19
20
Version 0.82
21
22
=cut
23
24
our $VERSION = '0.82';
25
26
=head1 SYNOPSIS
27
28
CGI::Buffer verifies the HTML that you produce by passing it through
29
C<HTML::Lint>.
30
31
CGI::Buffer optimises CGI programs by reducing, filtering and compressing
32
output to speed up the transmission and by nearly seamlessly making use of
33
client and server caches.
34
35
To make use of client caches, that is to say to reduce needless calls
36
to your server asking for the same data, all you need to do is to
37
include the package, and it does the rest.
38
39
    use CGI::Buffer;
40
    # ...
41
42
To also make use of server caches, that is to say to save regenerating
43
output when different clients ask you for the same data, you will need
44
to create a cache.
45
But that's simple:
46
47
    use CHI;
48
    use CGI::Buffer;
49
50
    # Put this at the top before you output anything
51
    CGI::Buffer::init(
52
	cache => CHI->new(driver => 'File')
53
    );
54
    if(CGI::Buffer::is_cached()) {
55
	# Nothing has changed - use the version in the cache
56
	exit;
57
    }
58
59
    # ...
60
61
To temporarily prevent the use of server-side caches, for example whilst
62
debugging before publishing a code change, set the NO_CACHE environment variable
63
to any non-zero value.
64
If you get errors about Wide characters in print it means that you've
65
forgotten to emit pure HTML on non-ASCII characters.
66
See L<HTML::Entities>.
67
As a hack work around you could also remove accents and the like by using
68
L<Text::Unidecode>,
69
which works well but isn't really what you want.
70
71
=head1 SUBROUTINES/METHODS
72
73
=cut
74
75
use constant MIN_GZIP_LEN => 32;
76
77
our $generate_etag = 1;
78
our $generate_304 = 1;
79
our $generate_last_modified = 1;
80
our $compress_content = 1;
81
our $optimise_content = 0;
82
our $lint_content = 0;
83
our $cache;
84
our $cache_age;
85
our $cache_key;
86
our $info;
87
our $logger;
88
our $lingua;
89
our $status;
90
our $script_mtime;
91
our $cobject;
92
our($x_cache, $buf, $headers, $header, $body, @content_type, $etag,
93
	$send_body, @o, $encode_loaded);
94
95
BEGIN {
96
	# use Exporter();
97
98
	$CGI::Buffer::buf = IO::String->new();
99 1
	$CGI::Buffer::old_buf = select($CGI::Buffer::buf);
100
101
	if((!defined($ENV{'SERVER_PROTOCOL'})) ||
102
	  ($ENV{'SERVER_PROTOCOL'} eq 'HTTP/1.0')) {
103
	  	$generate_etag = 0;
104
	}
105
}
106
107
END {
108
	if(defined($^V) && ($^V ge 'v5.14.0')) {
109
		return if ${^GLOBAL_PHASE} eq 'DESTRUCT';	# >= 5.14.0 only
110
	}
111
112
	if($logger) {
113
		if($ENV{'HTTP_IF_NONE_MATCH'}) {
114
			$logger->debug("HTTP_IF_NONE_MATCH: $ENV{HTTP_IF_NONE_MATCH}");
115
		}
116
		if($ENV{'HTTP_IF_MODIFIED_SINCE'}) {
117
			$logger->debug("HTTP_IF_MODIFIED_SINCE: $ENV{HTTP_IF_MODIFIED_SINCE}");
118
		}
119
		$logger->debug("Generate_etag = $generate_etag", "Generate_304 = $generate_304",
120
			"Generate_last_modified = $generate_last_modified");
121
122
		# This will cause everything to get flushed and prevent
123
		# outputs to the logger.  We need to do that now since
124
		# if we leave it to Perl to delete later we may get
125
		# a mesage that Log4Perl::init() hasn't been called
126
		$logger = undef;
127
	}
128 1
	select($CGI::Buffer::old_buf);
129
	my $pos = $CGI::Buffer::buf->getpos;
130
	$CGI::Buffer::buf->setpos(0);
131
	read($CGI::Buffer::buf, $buf, $pos);
132
	($headers, $body) = split /\r?\n\r?\n/, $buf, 2;
133
134
	unless($headers || is_cached()) {
135
		if($logger) {
136
			$logger->debug('There was no output');
137
		}
138
		return;
139
	}
140
	if($ENV{'REQUEST_METHOD'} && ($ENV{'REQUEST_METHOD'} eq 'HEAD')) {
141
		$send_body = 0;
142
	} else {
143
		$send_body = 1;
144
	}
145
146
	if($headers) {
147
		_set_content_type($headers);
148
	}
149
150
	if(defined($body) && ($body eq '')) {
151
		# E.g. if header of Location is given with no body, for
152
		#	redirection
153
		$body = undef;
154
		if($cache) {
155
			# Don't try to retrieve it below from the cache
156
			$send_body = 0;
157
		}
158
	} elsif(defined($content_type[0]) && (lc($content_type[0]) eq 'text') && (lc($content_type[1]) =~ /^html/) && defined($body)) {
159
		if($optimise_content) {
160
			# require HTML::Clean;
161
			require HTML::Packer;	# Overkill using HTML::Clean and HTML::Packer...
162
163
			if($logger) {
164
				$logger->trace('Packer');
165
			}
166
167
			my $oldlength = length($body);
168
			my $newlength;
169
170
			if($optimise_content == 1) {
171
				_optimise_content();
172
			} else {
173
				while(1) {
174
					_optimise_content();
175
					$newlength = length($body);
176
					last if ($newlength >= $oldlength);
177
					$oldlength = $newlength;
178
				}
179
			}
180
181
			# If we're on http://www.example.com and have a link
182
			# to http://www.example.com/foo/bar.htm, change the
183
			# link to /foo/bar.htm - there's no need to include
184
			# the site name in the link
185
			unless(defined($info)) {
186
				if($cache) {
187
					$info = CGI::Info->new({ cache => $cache });
188
				} else {
189
					$info = CGI::Info->new();
190
				}
191
			}
192
193
			my $href = $info->host_name();
194
			my $protocol = $info->protocol();
195
196
			unless($protocol) {
197
				$protocol = 'http';
198
			}
199
200
			$body =~ s/<a\s+?href="$protocol:\/\/$href"/<a href="\/"/gim;
201
			$body =~ s/<a\s+?href="$protocol:\/\/$href/<a href="/gim;
202
203
			# TODO use URI->path_segments to change links in
204
			# /aa/bb/cc/dd.htm which point to /aa/bb/ff.htm to
205
			# ../ff.htm
206
207
			# TODO: <img border=0 src=...>
208
			$body =~ s/<img\s+?src="$protocol:\/\/$href"/<img src="\/"/gim;
209
			$body =~ s/<img\s+?src="$protocol:\/\/$href/<img src="/gim;
210
211
			# Don't use HTML::Clean because of RT402
212
			# my $h = new HTML::Clean(\$body);
213
			# # $h->compat();
214
			# $h->strip();
215
			# my $ref = $h->data();
216
217
			# Don't always do javascript 'best' since it's confused
218
			# by the common <!-- HIDE technique.
219
			# See https://github.com/nevesenin/javascript-packer-perl/issues/1#issuecomment-4356790
220
			my $options = {
221
				remove_comments => 1,
222
				remove_newlines => 0,
223
				do_stylesheet => 'minify'
224
			};
225
			if($optimise_content >= 2) {
226
				$options->{do_javascript} = 'best';
227
				$body =~ s/(<script.*?>)\s*<!--/$1/gi;
228
				$body =~ s/\/\/-->\s*<\/script>/<\/script>/gi;
229
				$body =~ s/(<script.*?>)\s+/$1/gi;
230
			}
231
			$body = HTML::Packer->init()->minify(\$body, $options);
232
			if($optimise_content >= 2) {
233
				# Change document.write("a"); document.write("b")
234
				# into document.write("a"+"b");
235
				while(1) {
236
					$body =~ s/<script\s*?type\s*?=\s*?"text\/javascript"\s*?>(.*?)document\.write\((.+?)\);\s*?document\.write\((.+?)\)/<script type="text\/JavaScript">${1}document.write($2+$3)/igs;
237
					$newlength = length($body);
238
					last if ($newlength >= $oldlength);
239
					$oldlength = $newlength;
240
				}
241
			}
242
		}
243
		if($lint_content) {
244
			require HTML::Lint;
245
			HTML::Lint->import;
246
247
			if($logger) {
248
				$logger->trace('Lint');
249
			}
250
			my $lint = HTML::Lint->new();
251
			$lint->parse($body);
252
			$lint->eof();
253
254
			if($lint->errors) {
255
				$headers = 'Status: 500 Internal Server Error';
256
				@o = ('Content-type: text/plain');
257
				$body = '';
258
				foreach my $error ($lint->errors) {
259
					my $errtext = $error->where() . ': ' . $error->errtext() . "\n";
260
					warn($errtext);
261
					$body .= $errtext;
262
				}
263
			}
264
		}
265
	}
266
267
	if(defined($headers) && ($headers =~ /^Status: (\d+)/m)) {
268
		$status = $1;
269
	} elsif($info) {
270
		$status = $info->status();
271
	} else {
272
		$status = 200;
273
	}
274
275
	if($logger) {
276
		$logger->debug("Initial status = $status");
277
	}
278
279
	# Generate the eTag before compressing, since the compressed data
280
	# includes the mtime field which changes thus causing a different
281
	# Etag to be generated
282
	if($ENV{'SERVER_PROTOCOL'} &&
283
	  ($ENV{'SERVER_PROTOCOL'} eq 'HTTP/1.1') &&
284
	  $generate_etag && defined($body)) {
285
		# encode to avoid "Wide character in subroutine entry"
286
		require Encode;
287
		$encode_loaded = 1;
288
		$etag = '"' . Digest::MD5->new->add(Encode::encode_utf8($body))->hexdigest() . '"';
289
		if($ENV{'HTTP_IF_NONE_MATCH'} && $generate_304 && ($status == 200)) {
290
			if($logger) {
291
				$logger->debug("Compare $ENV{HTTP_IF_NONE_MATCH} with $etag");
292
			}
293
			if($ENV{'HTTP_IF_NONE_MATCH'} eq $etag) {
294
				push @o, "Status: 304 Not Modified";
295
				$send_body = 0;
296
				$status = 304;
297
				if($logger) {
298
					$logger->debug('Set status to 304');
299
				}
300
			} elsif($logger) {
301
				$logger->debug(diff(\$body, \$cache->get(_generate_key())));
302
			}
303
		}
304
	}
305
306
	my $encoding = _should_gzip();
307
	my $unzipped_body = $body;
308
309
	if(defined($unzipped_body)) {
310
		my $range = $ENV{'Range'} ? $ENV{'Range'} : $ENV{'HTTP_RANGE'};
311
312
		if($range && !$cache) {
313
			# TODO: Partials
314
			if($range =~ /^bytes=(\d*)-(\d*)/) {
315
				if($1 && $2) {
316
					$body = substr($body, $1, $2-$1);
317
				} elsif($1) {
318
					$body = substr($body, $1);
319
				} elsif($2) {
320
					$body = substr($body, 0, $2);
321
				}
322
				$unzipped_body = $body;
323
				$status = 206;
324
			}
325
		}
326
		_compress({ encoding => $encoding });
327
	}
328
329
	if($cache) {
330
		require Storable;
331
332
		my $cache_hash;
333
		my $key = _generate_key();
334
335
		# Cache unzipped version
336
		if(!defined($body)) {
337
			if($send_body) {
338
				$cobject = $cache->get_object($key);
339
				if(defined($cobject)) {
340
					$cache_hash = Storable::thaw($cobject->value());
341
					$headers = $cache_hash->{'headers'};
342
					_set_content_type($headers);
343
					@o = ("X-CGI-Buffer-$VERSION: Hit");
344
					if($info) {
345
						my $host_name = $info->host_name();
346
						push @o, "X-Cache: HIT from $host_name";
347
						push @o, "X-Cache-Lookup: HIT from $host_name";
348
					} else {
349
						push @o, 'X-Cache: HIT';
350
						push @o, 'X-Cache-Lookup: HIT';
351
					}
352
				} else {
353
					carp(__PACKAGE__, ": error retrieving data for key $key");
354
				}
355
			}
356
357
			# Nothing has been output yet, so we can check if it's
358
			# OK to send 304 if possible
359
			if($send_body && $ENV{'SERVER_PROTOCOL'} &&
360
			  ($ENV{'SERVER_PROTOCOL'} eq 'HTTP/1.1') &&
361
			  $generate_304 && ($status == 200)) {
362
				if($ENV{'HTTP_IF_MODIFIED_SINCE'}) {
363
					_check_modified_since({
364
						since => $ENV{'HTTP_IF_MODIFIED_SINCE'},
365
						modified => $cobject->created_at()
366
					});
367
				}
368
			}
369
			if($send_body && ($status == 200)) {
370
				$body = $cache_hash->{'body'};
371
				if(!defined($body)) {
372
					# Panic
373
					$headers = 'Status: 500 Internal Server Error';
374
					@o = ('Content-type: text/plain');
375
					$body = "Can't retrieve body for key $key, cache_hash contains:\n";
376
					foreach my $k (keys %{$cache_hash}) {
377
						$body .= "\t$k\n";
378
					}
379
					$cache->remove($key);
380
					if($logger) {
381
						$logger->error("Can't retrieve body for key $key");
382
					} else {
383
						carp "Can't retrieve body for key $key";
384
					}
385
					warn($body);
386
					$send_body = 0;
387
					$status = 500;
388
				}
389
			}
390
			if($send_body && $ENV{'SERVER_PROTOCOL'} &&
391
			  ($ENV{'SERVER_PROTOCOL'} eq 'HTTP/1.1') &&
392
			  ($status == 200)) {
393
				if($ENV{'HTTP_IF_NONE_MATCH'}) {
394
					if(!defined($etag)) {
395 1
						unless($encode_loaded) {
396
							require Encode;
397
							$encode_loaded = 1;
398
						}
399
						$etag = '"' . Digest::MD5->new->add(Encode::encode_utf8($body))->hexdigest() . '"';
400
					}
401
					if($logger && $generate_304) {
402
						$logger->debug("Compare etags $ENV{HTTP_IF_NONE_MATCH} and $etag");
403
					}
404
					if(($ENV{'HTTP_IF_NONE_MATCH'} eq $etag) && $generate_304) {
405
						push @o, "Status: 304 Not Modified";
406
						$status = 304;
407
						$send_body = 0;
408 1
						if($logger) {
409
							$logger->debug('Set status to 304');
410
						}
411
					}
412
				}
413
			}
414
			if($status == 200) {
415
				$encoding = _should_gzip();
416
				if($send_body) {
417
					if($generate_etag && !defined($etag) && ((!defined($headers)) || ($headers !~ /^ETag: /m))) {
418
						$etag = '"' . Digest::MD5->new->add(Encode::encode_utf8($body))->hexdigest() . '"';
419
					}
420
					_compress({ encoding => $encoding });
421
				}
422
			}
423
			my $cannot_304 = !$generate_304;
424
			unless($etag) {
425
				if(defined($headers) && ($headers =~ /^ETag: "([a-z0-9]{32})"/m)) {
426
					$etag = $1;
427
				} else {
428
					$etag = $cache_hash->{'etag'};
429
				}
430
			}
431
			if($ENV{'HTTP_IF_NONE_MATCH'} && $send_body && ($status != 304) && $generate_304) {
432
				if($logger) {
433
					$logger->debug("Compare $ENV{HTTP_IF_NONE_MATCH} with $etag");
434
				}
435
				if(defined($etag) && ($etag eq $ENV{'HTTP_IF_NONE_MATCH'}) && ($status == 200)) {
436
					push @o, "Status: 304 Not Modified";
437
					$send_body = 0;
438
					$status = 304;
439
					if($logger) {
440
						$logger->debug('Set status to 304');
441
					}
442
				} else {
443
					$cannot_304 = 1;
444
				}
445
			}
446
			if($cobject) {
447
				if($ENV{'HTTP_IF_MODIFIED_SINCE'} && ($status != 304) && !$cannot_304) {
448
					_check_modified_since({
449
						since => $ENV{'HTTP_IF_MODIFIED_SINCE'},
450
						modified => $cobject->created_at()
451
					});
452
				}
453
				if(($status == 200) && $generate_last_modified) {
454
					if($logger) {
455
						$logger->debug('Set Last-Modified to ', HTTP::Date::time2str($cobject->created_at()));
456
					}
457
					push @o, "Last-Modified: " . HTTP::Date::time2str($cobject->created_at());
458
				}
459
			}
460
		} else {
461
			# Not in the server side cache
462
			if($status == 200) {
463
				unless($cache_age) {
464
					# It would be great if CHI::set()
465
					# allowed the time to be 'lru' for least
466
					# recently used.
467
					$cache_age = '10 minutes';
468
				}
469
				$cache_hash->{'body'} = $unzipped_body;
470
				if(@o && defined($o[0])) {
471
					# Remember, we're storing the UNzipped
472
					# version in the cache
473
					my $c;
474
					if(defined($headers) && length($headers)) {
475
						$c = $headers . "\r\n" . join("\r\n", @o);
476
					} else {
477
						$c = join("\r\n", @o);
478
					}
479
					$c =~ s/^Content-Encoding: .+$//mg;
480
					$c =~ s/^Vary: Accept-Encoding.*\r?$//mg;
481
					$c =~ s/\n+/\n/gs;
482
					if(length($c)) {
483
						$cache_hash->{'headers'} = $c;
484
					}
485
				} elsif(defined($headers) && length($headers)) {
486
					$headers =~ s/^Content-Encoding: .+$//mg;
487
					$headers =~ s/^Vary: Accept-Encoding.*\r?$//mg;
488
					$headers =~ s/\n+/\n/gs;
489
					if(length($headers)) {
490
						$cache_hash->{'headers'} = $headers;
491
					}
492
				}
493
				if($generate_etag && defined($etag)) {
494
					$cache_hash->{'etag'} = $etag;
495
				}
496
				# TODO: Support the Expires header
497
				# if($headers !~ /^Expires: /m))) {
498
				# }
499
				if($logger) {
500
					$logger->debug("Store $key in the cache, age = $cache_age ", length($cache_hash->{'body'}), ' bytes');
501
				}
502
				$cache->set($key, Storable::freeze($cache_hash), $cache_age);
503
				if($generate_last_modified) {
504
					$cobject = $cache->get_object($key);
505
					if(defined($cobject)) {
506
						push @o, "Last-Modified: " . HTTP::Date::time2str($cobject->created_at());
507
					} else {
508
						push @o, "Last-Modified: " . HTTP::Date::time2str(time);
509
					}
510
				}
511
			}
512
			if($info) {
513
				my $host_name = $info->host_name();
514
				if(defined($x_cache)) {
515
					push @o, "X-Cache: $x_cache from $host_name";
516
				} else {
517
					push @o, "X-Cache: MISS from $host_name";
518
				}
519
				push @o, "X-Cache-Lookup: MISS from $host_name";
520
			} else {
521
				if(defined($x_cache)) {
522
					push @o, "X-Cache: $x_cache";
523
				} else {
524
					push @o, 'X-Cache: MISS';
525
				}
526
				push @o, 'X-Cache-Lookup: MISS';
527
			}
528
			push @o, "X-CGI-Buffer-$VERSION: Miss";
529
		}
530
		# We don't need it any more, so give Perl a chance to
531
		# tidy it up seeing as we're in the destructor
532
		$cache = undef;
533
	} elsif($info) {
534
		my $host_name = $info->host_name();
535
		push @o, ("X-Cache: MISS from $host_name", "X-Cache-Lookup: MISS from $host_name");
536
		if($generate_last_modified) {
537
			if(my $age = _my_age()) {
538
				push @o, 'Last-Modified: ' . HTTP::Date::time2str($age);
539
			}
540
		}
541
		if($ENV{'HTTP_IF_MODIFIED_SINCE'} && ($status != 304) && $generate_304) {
542
			_check_modified_since({
543
				since => $ENV{'HTTP_IF_MODIFIED_SINCE'},
544
				modified => _my_age()
545
			});
546
		}
547
	} else {
548
		push @o, ('X-Cache: MISS', 'X-Cache-Lookup: MISS');
549
	}
550
	if($generate_etag && ((!defined($headers)) || ($headers !~ /^ETag: /m))) {
551
		if(defined($etag)) {
552
			push @o, "ETag: $etag";
553
			if($logger) {
554
				$logger->debug("Set ETag to $etag");
555
			}
556
		} elsif($logger && (($status == 200) || $status == 304) && $body && !is_cached()) {
557
			$logger->warn("BUG: ETag not generated, status $status");
558
		}
559
	}
560
561
	my $body_length;
562
	if(defined($body)) {
563
		if(utf8::is_utf8($body)) {
564
			utf8::encode($body);
565
		}
566
		$body_length = length($body);
567
	} else {
568
		$body_length = 0;
569
	}
570
571
	if(defined($headers) && length($headers)) {
572
		# Put the original headers first, then those generated within
573
		# CGI::Buffer
574
		unshift @o, split(/\r\n/, $headers);
575
		if($body && $send_body) {
576 2
			if(scalar(grep(/^Content-Length: \d/, @o)) == 0) {
577
				push @o, "Content-Length: $body_length";
578
			}
579
		}
580 2
		if(scalar(grep(/^Status: \d/, @o)) == 0) {
581
			require HTTP::Status;
582
			HTTP::Status->import();
583
584
			push @o, "Status: $status " . HTTP::Status::status_message($status);
585
		}
586
	} else {
587
		push @o, "X-CGI-Buffer-$VERSION: No headers";
588
	}
589
590
	if($body_length && $send_body) {
591
		push @o, ('', $body);
592
	}
593
594
	# XXXXXXXXXXXXXXXXXXXXXXX
595
	if(0) {
596
		# This code helps to debug Wide character prints
597
		my $wideCharWarningsIssued = 0;
598
		my $widemess;
599
		$SIG{__WARN__} = sub {
600
			$wideCharWarningsIssued += "@_" =~ /Wide character in .../;
601
			$widemess = "@_";
602
			if($logger) {
603
				$logger->fatal($widemess);
604
				my $i = 1;
605
				$logger->trace('Stack Trace');
606
				while((my @call_details = (caller($i++)))) {
607
					$logger->trace($call_details[1] . ':' . $call_details[2] . ' in function ' . $call_details[3]);
608
				}
609
			}
610
			CORE::warn(@_);     # call the builtin warn as usual
611
		};
612
613
		if(scalar @o) {
614
			print join("\r\n", @o);
615
			if($wideCharWarningsIssued) {
616
				my $mess = join("\r\n", @o);
617
				$mess =~ /[^\x00-\xFF]/;
618
				open(my $fout, '>>', '/tmp/NJH');
619
				print $fout "$widemess:\n";
620
				print $fout $mess;
621
				print $fout 'x' x 40, "\n";
622
				close $fout;
623
			}
624
		}
625
	} elsif(scalar @o) {
626
		print join("\r\n", @o);
627
	}
628
	# XXXXXXXXXXXXXXXXXXXXXXX
629
630
	if((!$send_body) || !defined($body)) {
631
		print "\r\n\r\n";
632
	}
633
}
634
635
sub _check_modified_since {
636
	if($logger) {
637
		$logger->trace('In _check_modified_since');
638
	}
639
640
	if(!$generate_304) {
641
		return;
642
	}
643
	my $params = shift;
644
645
	if(!defined($$params{since})) {
646
		return;
647
	}
648
	my $s = HTTP::Date::str2time($$params{since});
649
	if(!defined($s)) {
650
		# IF_MODIFIED_SINCE isn't a valid data
651
		return;
652
	}
653
654
	my $age = _my_age();
655
	if(!defined($age)) {
656
		return;
657
	}
658
	if($age > $s) {
659
		if($logger) {
660
			$logger->debug('_check_modified_since: script has been modified');
661
		}
662
		# Script has been updated so it may produce different output
663
		return;
664
	}
665
666
	if($logger) {
667
		$logger->debug("_check_modified_since: Compare $$params{modified} with $s");
668
	}
669
	if($$params{modified} <= $s) {
670
		push @o, "Status: 304 Not Modified";
671
		$status = 304;
672
		$send_body = 0;
673
		if($logger) {
674
			$logger->debug('Set status to 304');
675
		}
676
	}
677
}
678
679
# Reduce output, e.g. remove superfluous white-space.
680
sub _optimise_content {
681
	# FIXME: regex bad, HTML parser good
682
	# Regexp::List - wow!
683
	$body =~ s/(((\s+|\r)\n|\n(\s+|\+)))/\n/g;
684
	# $body =~ s/\r\n/\n/gs;
685
	# $body =~ s/\s+\n/\n/gs;
686
	# $body =~ s/\n+/\n/gs;
687
	# $body =~ s/\n\s+|\s+\n/\n/g;
688
	$body =~ s/\<\/div\>\s+\<div/\<\/div\>\<div/gis;
689
	# $body =~ s/\<\/p\>\s\<\/div/\<\/p\>\<\/div/gis;
690
	# $body =~ s/\<div\>\s+/\<div\>/gis;	# Remove spaces after <div>
691
	$body =~ s/(<div>\s+|\s+<div>)/<div>/gis;
692
	$body =~ s/\s+<\/div\>/\<\/div\>/gis;	# Remove spaces before </div>
693
	$body =~ s/\s+\<p\>|\<p\>\s+/\<p\>/im;  # TODO <p class=
694
	$body =~ s/\s+\<\/p\>|\<\/p\>\s+/\<\/p\>/gis;
695
	$body =~ s/<html>\s+<head>/<html><head>/is;
696
	$body =~ s/\s*<\/head>\s+<body>\s*/<\/head><body>/is;
697
	$body =~ s/<html>\s+<body>/<html><body>/is;
698
	$body =~ s/<body>\s+/<body>/is;
699
	$body =~ s/\s+\<\/html/\<\/html/is;
700
	$body =~ s/\s+\<\/body/\<\/body/is;
701
	$body =~ s/\s(\<.+?\>\s\<.+?\>)/$1/;
702
	# $body =~ s/(\<.+?\>\s\<.+?\>)\s/$1/g;
703
	$body =~ s/\<p\>\s/\<p\>/gi;
704
	$body =~ s/\<\/p\>\s\<p\>/\<\/p\>\<p\>/gi;
705
	$body =~ s/\<\/tr\>\s\<tr\>/\<\/tr\>\<tr\>/gi;
706
	$body =~ s/\<\/td\>\s\<\/tr\>/\<\/td\>\<\/tr\>/gi;
707
	$body =~ s/\<\/td\>\s*\<td\>/\<\/td\>\<td\>/gis;
708
	$body =~ s/\<\/tr\>\s\<\/table\>/\<\/tr\>\<\/table\>/gi;
709
	$body =~ s/\<br\s?\/?\>\s?\<p\>/\<p\>/gi;
710
	$body =~ s/\<br\>\s/\<br\>/gi;
711
	$body =~ s/\s+\<br/\<br/gi;
712
	$body =~ s/\<br\s?\/\>\s/\<br \/\>/gi;
713
	$body =~ s/[ \t]+/ /gs;	# Remove duplicate space, don't use \s+ it breaks JavaScript
714
	$body =~ s/\s\<p\>/\<p\>/gi;
715
	$body =~ s/\s\<script/\<script/gi;
716
	$body =~ s/(<script>\s|\s<script>)/<script>/gis;
717
	$body =~ s/(<\/script>\s|\s<\/script>)/<\/script>/gis;
718
	$body =~ s/\<td\>\s/\<td\>/gi;
719
	$body =~ s/\s+\<a\shref="(.+?)"\>\s?/ <a href="$1">/gis;
720
	$body =~ s/\s?<a\shref=\s"(.+?)"\>/ <a href="$1">/gis;
721
	$body =~ s/\s+<\/a\>\s+/<\/a> /gis;
722
	$body =~ s/(\s?<hr>\s+|\s+<hr>\s?)/<hr>/gis;
723
	# $body =~ s/\s<hr>/<hr>/gis;
724
	# $body =~ s/<hr>\s/<hr>/gis;
725
	$body =~ s/<\/li>\s+<li>/<\/li><li>/gis;
726
	$body =~ s/<\/li>\s+<\/ul>/<\/li><\/ul>/gis;
727
	$body =~ s/<ul>\s+<li>/<ul><li>/gis;
728
	$body =~ s/\s+<\/li>/<\/li>/gis;
729
	$body =~ s/\<\/option\>\s+\<option/\<\/option\>\<option/gis;
730
	$body =~ s/<title>\s*(.+?)\s*<\/title>/<title>$1<\/title>/is;
731
	$body =~ s/<\/center>\s+<center>/ /gis;
732
}
733
734
# Create a key for the cache
735
sub _generate_key {
736
	if($cache_key) {
737
		return $cache_key;
738
	}
739
	unless(defined($info)) {
740
		$info = CGI::Info->new({ cache => $cache });
741
	}
742
743
	my $key = $info->browser_type() . '::' . $info->domain_name() . '::' . $info->script_name() . '::' . $info->as_string();
744
	if($lingua) {
745
		$key .= '::' . $lingua->language();
746
	}
747
	if($ENV{'HTTP_COOKIE'}) {
748
		# Different states of the client are stored in different caches
749
		# Don't put different Google Analytics in different caches, and anyway they
750
		# would be wrong
751
		foreach my $cookie(split(/;/, $ENV{'HTTP_COOKIE'})) {
752
			unless($cookie =~ /^__utm[abcz]/) {
753
				$key .= "::$cookie";
754
			}
755
		}
756
	}
757
758
	# Honour the Vary headers
759
	if($headers && ($headers =~ /^Vary: .*$/m)) {
760
		if(defined($logger)) {
761
			$logger->debug('Found Vary header');
762
		}
763
		foreach my $h1(split(/\r?\n/, $headers)) {
764
			my ($h1_name, $h1_value) = split /\:\s*/, $h1, 2;
765
			if(lc($h1_name) eq 'vary') {
766
				foreach my $h2(split(/\r?\n/, $headers)) {
767
					my ($h2_name, $h2_value) = split /\:\s*/, $h2, 2;
768
					if($h2_name eq $h1_value) {
769
						$key .= '::' . $h2_value;
770
						last;
771
					}
772
				}
773
			}
774
		}
775
	}
776
	$key =~ s/\//::/g;
777
	$key =~ s/::::/::/g;
778
	$key =~ s/::$//;
779
	if(defined($logger)) {
780
		$logger->trace("Returning $key");
781
	}
782
	$cache_key = $key;
783
	return $key;
784
}
785
786
=head2 init
787
788
Set various options and override default values.
789
790
    # Put this toward the top of your program before you do anything
791
    # By default, generate_tag, generate_304 and compress_content are ON,
792
    # optimise_content and lint_content are OFF.  Set optimise_content to 2 to
793
    # do aggressive JavaScript optimisations which may fail.
794
    use CGI::Buffer;
795
    CGI::Buffer::init(
796
	generate_etag => 1,	# make good use of client's cache
797
	generate_last_modified => 1,	# more use of client's cache
798
	compress_content => 1,	# if gzip the output
799
	optimise_content => 0,	# optimise your program's HTML, CSS and JavaScript
800
	cache => CHI->new(driver => 'File'),	# cache requests
801
	cache_key => 'string',	# key for the cache
802
	cache_age => '10 minutes',	# how long to store responses in the cache
803
	logger => $logger,
804
	lint_content => 0,	# Pass through HTML::Lint
805
	generate_304 => 1,	# Generate 304: Not modified
806
	lingua => CGI::Lingua->new(),
807
    );
808
809
If no cache_key is given, one will be generated which may not be unique.
810
The cache_key should be a unique value dependent upon the values set by the
811
browser.
812
813
The cache object will be an object that understands get_object(),
814
set(), remove() and created_at() messages, such as an L<CHI> object. It is
815
used as a server-side cache to reduce the need to rerun database accesses.
816
817
Items stay in the server-side cache by default for 10 minutes.
818
This can be overridden by the cache_control HTTP header in the request, and
819
the default can be changed by the cache_age argument to init().
820
821
Logger will be an object that understands debug() such as an L<Log::Log4perl>
822
object.
823
824
To generate a last_modified header, you must give a cache object.
825
826
Init allows a reference of the options to be passed. So both of these work:
827
    use CGI::Buffer;
828
    #...
829
    CGI::Buffer::init(generate_etag => 1);
830
    CGI::Buffer::init({ generate_etag => 1, info => CGI::Info->new() });
831
832
Generally speaking, passing by reference is better since it copies less on to
833
the stack.
834
835
Alternatively you can give the options when loading the package:
836
    use CGI::Buffer { optimise_content => 1 };
837
838
=cut
839
840
sub init {
841
	my %params = (ref($_[0]) eq 'HASH') ? %{$_[0]} : @_;
842
843
	# Safe options - can be called at any time
844
	if(defined($params{generate_etag})) {
845
		$generate_etag = $params{generate_etag};
846
	}
847
	if(defined($params{generate_last_modified})) {
848
		$generate_last_modified = $params{generate_last_modified};
849
	}
850
	if(defined($params{compress_content})) {
851
		$compress_content = $params{compress_content};
852
	}
853
	if(defined($params{optimise_content})) {
854
		$optimise_content = $params{optimise_content};
855
	}
856
	if(defined($params{lint_content})) {
857
		$lint_content = $params{lint_content};
858
	}
859
	if(defined($params{logger})) {
860
		$logger = $params{logger};
861
	}
862
	if(defined($params{lingua})) {
863
		$lingua = $params{lingua};
864
	}
865
	if(defined($params{generate_304})) {
866
		$generate_304 = $params{generate_304};
867
	}
868
	if(defined($params{info}) && (!defined($info))) {
869
		$info = $params{info};
870
	}
871
872
	# Unsafe options - must be called before output has been started
873
	my $pos = $CGI::Buffer::buf->getpos;
874
	if($pos > 0) {
875
		if(defined($logger)) {
876
			my @call_details = caller(0);
877
			$logger->warn("Too late to call init, $pos characters have been printed, caller line $call_details[2] of $call_details[1]");
878
		} else {
879
			# Must do Carp::carp instead of carp for Test::Carp
880
			Carp::carp "Too late to call init, $pos characters have been printed";
881
		}
882
	}
883
	if(defined($params{cache}) && can_cache()) {
884
		if(defined($ENV{'HTTP_CACHE_CONTROL'})) {
885
			my $control = $ENV{'HTTP_CACHE_CONTROL'};
886
			if(defined($logger)) {
887
				$logger->debug("cache_control = $control");
888
			}
889
			if($control =~ /^max-age\s*=\s*(\d+)$/) {
890
				# There is an argument not to do this
891
				# since one client will affect others
892
				$cache_age = "$1 seconds";
893
				if(defined($logger)) {
894
					$logger->debug("cache_age = $cache_age");
895
				}
896
			}
897
		}
898
		$cache_age ||= $params{cache_age};
899
900
		if((!defined($params{cache})) && defined($cache)) {
901
			if(defined($logger)) {
902
				if($cache_key) {
903
					$logger->debug("disabling cache $cache_key");
904
				} else {
905
					$logger->debug('disabling cache');
906
				}
907
			}
908
			$cache = undef;
909
		} else {
910
			$cache = $params{cache};
911
		}
912
		if(defined($params{cache_key})) {
913
			$cache_key = $params{cache_key};
914
		}
915
	}
916
}
917
918
sub import {
919
	# my $class = shift;
920
	shift;
921
922
	return unless @_;
923
924
	init(@_);
925
}
926
927
=head2 set_options
928
929
Synonym for init, kept for historical reasons.
930
931
=cut
932
933
sub set_options {
934
	my %params = (ref($_[0]) eq 'HASH') ? %{$_[0]} : @_;
935
936
	init(\%params);
937
}
938
939
=head2 can_cache
940
941
Returns true if the server is allowed to store the results locally.
942
943
=cut
944
945
sub can_cache {
946
	if(defined($x_cache)) {
947
		return ($x_cache eq 'HIT');
948
	}
949
950
	if(defined($ENV{'NO_CACHE'}) || defined($ENV{'NO_STORE'})) {
951
		$x_cache = 'MISS';
952
		return 0;
953
	}
954
	if(defined($ENV{'HTTP_CACHE_CONTROL'})) {
955
		my $control = $ENV{'HTTP_CACHE_CONTROL'};
956
		if(defined($logger)) {
957
			$logger->debug("cache_control = $control");
958
		}
959
		# TODO: check Authorization header not present
960
		if(($control eq 'no-store') ||
961
		       ($control eq 'no-cache') ||
962
		       ($control eq 'max-age=0') ||
963
		       ($control eq 'private')) {
964
			$x_cache = 'MISS';
965
			return 0;
966
		}
967
	}
968
	$x_cache = 'HIT';
969
	return 1;
970
}
971
972
=head2 is_cached
973
974
Returns true if the output is cached. If it is then it means that all of the
975
expensive routines in the CGI script can be by-passed because we already have
976
the result stored in the cache.
977
978
    # Put this toward the top of your program before you do anything
979
980
    # Example key generation - use whatever you want as something
981
    # unique for this call, so that subsequent calls with the same
982
    # values match something in the cache
983
    use CGI::Info;
984
    use CGI::Lingua;
985
    use CGI::Buffer;
986
987
    my $i = CGI::Info->new();
988
    my $l = CGI::Lingua->new(supported => ['en']);
989
990
    # To use server side caching you must give the cache argument, however
991
    # the cache_key argument is optional - if you don't give one then one will
992
    # be generated for you
993
    if(CGI::Buffer::can_cache()) {
994
        CGI::Buffer::init(
995
	    cache => CHI->new(driver => 'File'),
996
	    cache_key => $i->domain_name() . '/' . $i->script_name() . '/' . $i->as_string() . '/' . $l->language()
997
        );
998
        if(CGI::Buffer::is_cached()) {
999
	    # Output will be retrieved from the cache and sent automatically
1000
	    exit;
1001
        }
1002
    }
1003
    # Not in the cache, so now do our expensive computing to generate the
1004
    # results
1005
    print "Content-type: text/html\n";
1006
    # ...
1007
1008
=cut
1009
1010
sub is_cached {
1011
	unless($cache) {
1012
		if($logger) {
1013
			$logger->debug("is_cached: cache hasn't been enabled");
1014
		}
1015
		return 0;
1016
	}
1017
1018
	my $key = _generate_key();
1019
1020
	if($logger) {
1021
		$logger->debug("is_cached: looking for key = $key");
1022
	}
1023
	$cobject = $cache->get_object($key);
1024
	unless($cobject) {
1025
		if($logger) {
1026
			$logger->debug('not found in cache');
1027
		}
1028
		return 0;
1029
	}
1030
	unless($cobject->value($key)) {
1031
		if($logger) {
1032
			$logger->warn('is_cached: object is in the cache but not the data');
1033
		}
1034
		$cobject = undef;
1035
		return 0;
1036
	}
1037
1038
	# If the script has changed, don't use the cache since we may produce
1039
	# different output
1040
	my $age = _my_age();
1041
	unless(defined($age)) {
1042
		if($logger) {
1043
			$logger->debug("Can't determine script's age");
1044
		}
1045
		# Can't determine the age. Play it safe an assume we're not
1046
		# cached
1047
		$cobject = undef;
1048
		return 0;
1049
	}
1050
	if($age > $cobject->created_at()) {
1051
		# Script has been updated so it may produce different output
1052
		if($logger) {
1053
			$logger->debug('Script has been updated');
1054
		}
1055
		$cobject = undef;
1056
		# Nothing will be in date and all new searches would miss
1057
		# anyway, so may as well clear it all
1058
		# FIXME: RT104471
1059
		# $cache->clear();
1060
		return 0;
1061
	}
1062
	if($logger) {
1063
		$logger->debug('Script is in the cache');
1064
	}
1065
	return 1;
1066
}
1067
1068
sub _my_age {
1069
	if($script_mtime) {
1070
		return $script_mtime;
1071
	}
1072
	unless(defined($info)) {
1073
		if($cache) {
1074
			$info = CGI::Info->new({ cache => $cache });
1075
		} else {
1076
			$info = CGI::Info->new();
1077
		}
1078
	}
1079
1080
	my $path = $info->script_path();
1081
	unless(defined($path)) {
1082
		return;
1083
	}
1084
1085
	my @statb = stat($path);
1086
	$script_mtime = $statb[9];
1087
	return $script_mtime;
1088
}
1089
1090
sub _should_gzip
1091
{
1092
	if($compress_content && ($ENV{'HTTP_ACCEPT_ENCODING'} || $ENV{'HTTP_TE'})) {
1093
		if(scalar(@content_type)) {
1094
			if($content_type[0] ne 'text') {
1095
				return '';
1096
			}
1097
		}
1098
		my $accept = lc($ENV{'HTTP_ACCEPT_ENCODING'} ? $ENV{'HTTP_ACCEPT_ENCODING'} : $ENV{'HTTP_TE'});
1099
		foreach my $method(split(/,\s?/, $accept)) {
1100
			if(($method eq 'gzip') || ($method eq 'x-gzip') || ($method eq 'br')) {
1101
				return $method;
1102
			}
1103
		}
1104
	}
1105
1106
	return '';
1107
}
1108
1109
sub _set_content_type
1110
{
1111
	my $headers = shift;
1112
1113
	foreach my $header (split(/\r?\n/, $headers)) {
1114
		my ($header_name, $header_value) = split /\:\s*/, $header, 2;
1115
		if (lc($header_name) eq 'content-type') {
1116
			@content_type = split /\//, $header_value, 2;
1117
			last;
1118
		}
1119
	}
1120
}
1121
1122
sub _compress {
1123
	my %params = (ref($_[0]) eq 'HASH') ? %{$_[0]} : @_;
1124
1125
	my $encoding = $params{encoding};
1126
1127
	if((length($encoding) == 0) || (length($body) < MIN_GZIP_LEN)) {
1128
		return;
1129
	}
1130
1131
	if($encoding eq 'gzip') {
1132
		require Compress::Zlib;
1133
		Compress::Zlib->import;
1134
1135
		# Avoid 'Wide character in memGzip'
1136
		unless($encode_loaded) {
1137
			require Encode;
1138
			$encode_loaded = 1;
1139
		}
1140
		my $nbody = Compress::Zlib::memGzip(\Encode::encode_utf8($body));
1141
		if(length($nbody) < length($body)) {
1142
			$body = $nbody;
1143
			push @o, "Content-Encoding: $encoding";
1144
			push @o, "Vary: Accept-Encoding";
1145
		}
1146
	} elsif($encoding eq 'br') {
1147
		require IO::Compress::Brotli;
1148
		IO::Compress::Brotli->import();
1149
1150
		# Avoid 'Wide character in memGzip'
1151
		unless($encode_loaded) {
1152
			require Encode;
1153
			$encode_loaded = 1;
1154
		}
1155
		my $nbody = IO::Compress::Brotli::bro(Encode::encode_utf8($body));
1156
		if(length($nbody) < length($body)) {
1157
			$body = $nbody;
1158
			push @o, "Content-Encoding: $encoding";
1159
			push @o, "Vary: Accept-Encoding";
1160
		}
1161
	}
1162
}
1163
1164
=head1 AUTHOR
1165
1166
Nigel Horne, C<< <njh at bandsman.co.uk> >>
1167
1168
=head1 BUGS
1169
1170
CGI::Buffer should be safe even in scripts which produce lots of different
1171
output, e.g. e-commerce situations.
1172
On such pages, however, I strongly urge to setting generate_304 to 0 and
1173
sending the HTTP header "Cache-Control: no-cache".
1174
1175
When using L<Template>, ensure that you don't use it to output to STDOUT,
1176
instead you will need to capture into a variable and print that.
1177
For example:
1178
1179
    my $output;
1180
    $template->process($input, $vars, \$output) || ($output = $template->error());
1181
    print $output;
1182
1183
Can produce buggy JavaScript if you use the <!-- HIDING technique.
1184
This is a bug in L<JavaScript::Packer>, not CGI::Buffer.
1185
See https://github.com/nevesenin/javascript-packer-perl/issues/1#issuecomment-4356790
1186
1187
Mod_deflate can confuse this when compressing output.
1188
Ensure that deflation is off for .pl files:
1189
1190
    SetEnvIfNoCase Request_URI \.(?:gif|jpe?g|png|pl)$ no-gzip dont-vary
1191
1192
If you request compressed output then uncompressed output (or vice
1193
versa) on input that produces the same output, the status will be 304.
1194
The letter of the spec says that's wrong, so I'm noting it here, but
1195
in practice you should not see this happen or have any difficulties
1196
because of it.
1197
1198
CGI::Buffer is not compatible with FastCGI.
1199
1200
I advise adding CGI::Buffer as the last use statement so that it is
1201
cleared up first.  In particular it should be loaded after
1202
L<Log::Log4Perl>, if you're using that, so that any messages it
1203
produces are printed after the HTTP headers have been sent by
1204
CGI::Buffer;
1205
1206
CGI::Buffer is not compatible with FCGI, use L<FCGI::Buffer> instead.
1207
1208
Please report any bugs or feature requests to C<bug-cgi-buffer at rt.cpan.org>,
1209
or through the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=CGI-Buffer>.
1210
I will be notified, and then you'll automatically be notified of progress on
1211
your bug as I make changes.
1212
1213
=head1 SEE ALSO
1214
1215
L<HTML::Packer>, L<HTML::Lint>
1216
1217
=head1 SUPPORT
1218
1219
You can find documentation for this module with the perldoc command.
1220
1221
    perldoc CGI::Buffer
1222
1223
You can also look for information at:
1224
1225
=over 4
1226
1227
=item * MetaCPAN
1228
1229
L<https://metacpan.org/release/CGI-Buffer>
1230
1231
=item * RT: CPAN's request tracker
1232
1233
L<https://rt.cpan.org/NoAuth/Bugs.html?Dist=CGI-Buffer>
1234
1235
=item * CPANTS
1236
1237
L<http://cpants.cpanauthors.org/dist/CGI-Buffer>
1238
1239
=item * CPAN Testers' Matrix
1240
1241
L<http://matrix.cpantesters.org/?dist=CGI-Buffer>
1242
1243
=item * CPAN Ratings
1244
1245
L<http://cpanratings.perl.org/d/CGI-Buffer>
1246
1247
=item * CPAN Testers Dependencies
1248
1249
L<http://deps.cpantesters.org/?module=CGI::Buffer>
1250
1251
=back
1252
1253
=head1 ACKNOWLEDGEMENTS
1254
1255
The inspiration and code for some of this is cgi_buffer by Mark
1256
Nottingham: L<https://www.mnot.net/blog/2003/04/24/etags>.
1257
1258
=head1 LICENSE AND COPYRIGHT
1259
1260
The licence for cgi_buffer is:
1261
1262
    "(c) 2000 Copyright Mark Nottingham <mnot@pobox.com>
1263
1264
    This software may be freely distributed, modified and used,
1265
    provided that this copyright notice remain intact.
1266
1267
    This software is provided 'as is' without warranty of any kind."
1268
1269
The rest of the program is Copyright 2011-2018 Nigel Horne,
1270
and is released under the following licence: GPL2
1271
1272
=cut
1273
1274
1; # End of CGI::Buffer