@@ -129,30 +129,32 @@ HTML::Parser - HTML parser class
129129
130130=head1 SYNOPSIS
131131
132- use strict;
133- use warnings;
134- use HTML::Parser ();
135-
136- # Create parser object
137- my $p = HTML::Parser->new(
138- api_version => 3,
139- start_h => [\&start, "tagname, attr"],
140- end_h => [\&end, "tagname"],
141- marked_sections => 1,
142- );
143-
144- # Parse document text chunk by chunk
145- $p->parse($chunk1);
146- $p->parse($chunk2);
147- # ...
148- # signal end of document
149- $p->eof;
150-
151- # Parse directly from file
152- $p->parse_file("foo.html");
153- # or
154- open(my $fh, "<:utf8", "foo.html") || die;
155- $p->parse_file($fh);
132+ use strict;
133+ use warnings;
134+ use HTML::Parser ();
135+
136+ # Create parser object
137+ my $p = HTML::Parser->new(
138+ api_version => 3,
139+ start_h => [\&start, "tagname, attr"],
140+ end_h => [\&end, "tagname"],
141+ marked_sections => 1,
142+ );
143+
144+ # Parse document text chunk by chunk
145+ $p->parse($chunk1);
146+ $p->parse($chunk2);
147+
148+ # ...
149+ # signal end of document
150+ $p->eof;
151+
152+ # Parse directly from file
153+ $p->parse_file("foo.html");
154+
155+ # or
156+ open(my $fh, "<:utf8", "foo.html") || die;
157+ $p->parse_file($fh);
156158
157159=head1 DESCRIPTION
158160
@@ -262,14 +264,14 @@ Parsing will also abort if one of the event handlers calls $p->eof.
262264
263265The effect of this is the same as:
264266
265- while (1) {
266- my $chunk = &$code_ref();
267- if (!defined($chunk) || !length($chunk)) {
268- $p->eof;
269- return $p;
267+ while (1) {
268+ my $chunk = &$code_ref();
269+ if (!defined($chunk) || !length($chunk)) {
270+ $p->eof;
271+ return $p;
272+ }
273+ $p->parse($chunk) || return undef;
270274 }
271- $p->parse($chunk) || return undef;
272- }
273275
274276But it is more efficient as this loop runs internally in XS code.
275277
@@ -988,24 +990,24 @@ HTML::Parser version 2 callback methods.
988990
989991This is equivalent to the following method calls:
990992
991- $p->handler(start => "start", "self, tagname, attr, attrseq, text");
992- $p->handler(end => "end", "self, tagname, text");
993- $p->handler(text => "text", "self, text, is_cdata");
994- $p->handler(process => "process", "self, token0, text");
995- $p->handler(
996- comment => sub {
997- my ($self, $tokens) = @_;
998- for (@$tokens) {$self->comment($_);}
999- },
1000- "self, tokens"
1001- );
1002- $p->handler(
1003- declaration => sub {
1004- my $self = shift;
1005- $self->declaration(substr($_[0], 2, -1));
1006- },
1007- "self, text"
1008- );
993+ $p->handler(start => "start", "self, tagname, attr, attrseq, text");
994+ $p->handler(end => "end", "self, tagname, text");
995+ $p->handler(text => "text", "self, text, is_cdata");
996+ $p->handler(process => "process", "self, token0, text");
997+ $p->handler(
998+ comment => sub {
999+ my ($self, $tokens) = @_;
1000+ for (@$tokens) { $self->comment($_); }
1001+ },
1002+ "self, tokens"
1003+ );
1004+ $p->handler(
1005+ declaration => sub {
1006+ my $self = shift;
1007+ $self->declaration(substr($_[0], 2, -1));
1008+ },
1009+ "self, text"
1010+ );
10091011
10101012Setting up these handlers can also be requested with the "api_version =>
101110132" constructor option.
@@ -1023,19 +1025,21 @@ The first simple example shows how you might strip out comments from
10231025an HTML document. We achieve this by setting up a comment handler that
10241026does nothing and a default handler that will print out anything else:
10251027
1026- use HTML::Parser;
1027- HTML::Parser->new(
1028- default_h => [sub { print shift }, 'text'],
1029- comment_h => [""],
1030- )->parse_file(shift || die) || die $!;
1028+ use HTML::Parser ();
1029+ HTML::Parser->new(
1030+ default_h => [sub { print shift }, 'text'],
1031+ comment_h => [""],
1032+ )->parse_file(shift || die)
1033+ || die $!;
10311034
10321035An alternative implementation is:
10331036
1034- use HTML::Parser;
1035- HTML::Parser->new(
1036- end_document_h => [sub { print shift }, 'skipped_text'],
1037- comment_h => [""],
1038- )->parse_file(shift || die) || die $!;
1037+ use HTML::Parser ();
1038+ HTML::Parser->new(
1039+ end_document_h => [sub { print shift }, 'skipped_text'],
1040+ comment_h => [""],
1041+ )->parse_file(shift || die)
1042+ || die $!;
10391043
10401044This will in most cases be much more efficient since only a single
10411045callback will be made.
@@ -1046,24 +1050,24 @@ handler. When it sees the title start tag it enables a text handler
10461050that prints any text found and an end handler that will terminate
10471051parsing as soon as the title end tag is seen:
10481052
1049- use HTML::Parser ();
1050-
1051- sub start_handler {
1052- return if shift ne "title";
1053- my $self = shift;
1054- $self->handler(text => sub { print shift }, "dtext");
1055- $self->handler(
1056- end => sub {
1057- shift->eof if shift eq "title";
1058- },
1059- "tagname,self"
1060- );
1061- }
1053+ use HTML::Parser ();
1054+
1055+ sub start_handler {
1056+ return if shift ne "title";
1057+ my $self = shift;
1058+ $self->handler(text => sub { print shift }, "dtext");
1059+ $self->handler(
1060+ end => sub {
1061+ shift->eof if shift eq "title";
1062+ },
1063+ "tagname,self"
1064+ );
1065+ }
10621066
1063- my $p = HTML::Parser->new(api_version => 3);
1064- $p->handler(start => \&start_handler, "tagname,self");
1065- $p->parse_file(shift || die) || die $!;
1066- print "\n";
1067+ my $p = HTML::Parser->new(api_version => 3);
1068+ $p->handler(start => \&start_handler, "tagname,self");
1069+ $p->parse_file(shift || die) || die $!;
1070+ print "\n";
10671071
10681072More examples are found in the F<eg/> directory of the C<HTML-Parser>
10691073distribution: the program C<hrefsub> shows how you can edit all links
0 commit comments