Committer: gariev
LJINT-454: Comments for side projects (global) - stage 2

U   trunk/cgi-bin/LJ/HTML/Metadata.pm
Modified: trunk/cgi-bin/LJ/HTML/Metadata.pm
===================================================================
--- trunk/cgi-bin/LJ/HTML/Metadata.pm	2011-11-23 15:33:11 UTC (rev 20617)
+++ trunk/cgi-bin/LJ/HTML/Metadata.pm	2011-11-23 15:36:56 UTC (rev 20618)
@@ -98,7 +98,7 @@
         return $self->_fetch_html;
     }
 
-    if ( $metadata_fields{$key} ) {
+    if ( exists $metadata_fields{$key} ) {
         $self->_extract_metadata;
         return Encode::encode_utf8( $self->SUPER::get($key) );
     }
@@ -177,8 +177,7 @@
         if ( my $attrval = $attr->{ $rule->{'extract_attr'} } ) {
             $extracted_data->{ $rule->{'fill'} } ||= $attrval;
         }
-
-        next;
+        return;
     }
 
     return;
@@ -190,8 +189,26 @@
     return if $self->{'_html_parsed'};
 
     my $html = $self->html;
+
     if ( ! Encode::is_utf8($html) ) {
-        $html = Encode::decode_utf8($html);
+        ## pass #1 - find the document encoding
+        my $encoding = "utf-8";
+        {
+            my $parser = HTML::TokeParser->new( \$html );
+            while (my $taginfo = $parser->get_tag('meta')) {
+                my $attr = $taginfo->[1];
+                my $he = $attr->{'http-equiv'};
+                if ($he && lc($he) eq 'content-type') {
+                    my $content = $attr->{'content'};
+                    if ($content && $content =~ /charset=([\w\-]+)/) {
+                        $encoding = $1;
+                    }
+                    last;
+                }
+            }
+        }
+
+        $html = Encode::decode($encoding, $html);
     }
 
     my %extracted_data;
@@ -218,7 +235,7 @@
             } );
         }
     }
-    
+
     $self->title( $extracted_data{'og_title'}
         || $extracted_data{'meta_title'}
         || $extracted_data{'title'}