package FEAR::API::Document; use strict; use Carp; use Spiffy -base; use FEAR::API::Closure; use File::MMagic; use FEAR::API::SourceFilter; use Encode; use Text::Iconv; use MIME::Base64; use Digest::MD5 qw(md5); #use FEAR::API::Translate -base; _chain 'document'; _alias content => 'document'; _field 'title'; #_field 'type'; sub length { CORE::length($self->{document}); } _alias size => 'length'; sub as_string { $self->{document}; } sub digest { my $digest; if( Encode::is_utf8 $self->{document} ){ Encode::_utf8_off $self->{document}; $digest = md5 $self->{document}; Encode::_utf8_on $self->{document}; } else { $digest = md5 $self->{document}; } return $digest; } sub is_utf8 { Encode::is_utf8($self->{document}); } chain_sub utf8_on { Encode::_utf8_on($self->{document}); } chain_sub utf8_off { Encode::_utf8_off($self->{document}); } chain_sub iconv_from { my $converter = Text::Iconv->new((shift() || croak "please input encoding"), 'UTF-8') or croak $!; $self->{document} = $converter->convert($self->{document}); $self->utf8_on; } chain_sub append { $self->{document} .= shift; return $self; } chain_sub clear { $self->{document} = undef; return $self; } sub type { File::MMagic->new->checktype_contents($self->{document}); } # $self->diff(\$text); # $self->diff($filename); # $self->diff(\$text, \$output); # $self->diff($filename, $output_file); sub diff { } sub guess_encoding { } ###################################################################### # Compression / Decompression ###################################################################### use Compress::Zlib; chain_sub try_compress { if($self->{document}){ my $td = Compress::Zlib::memGzip $self->{document}; $self->{document} = $td if $td; } } _alias try_uncompress => 'try_decompress'; chain_sub try_decompress { if($self->{document}){ my $td = Compress::Zlib::memGunzip $self->{document}; $self->{document} = $td if $td; } } ###################################################################### # MIME methods ###################################################################### _alias mime_encode => 'MIME_encode'; sub MIME_encode { $self->utf8_off; $self->{document} = encode_base64 $self->{document}; } _alias mime_decode => 'MIME_decode'; sub MIME_decode { $self->{document} = decode_base64 $self->{document}; $self->utf8_on; } ###################################################################### # Testing methods ###################################################################### sub match { my $pattern = shift; $self->{document} =~ /$pattern/; } sub title_is { } sub title_like { } sub title_unlike { } sub content_is { $self->{document} eq shift; } sub content_contains { my $pattern = shift; $self->{document} =~ /$pattern/; } sub content_lacks { my $pattern = shift; $self->{document} !~ /$pattern/; } sub content_like { } sub content_unlike { } sub has_tag { } sub has_tag_like { } ###################################################################### # Shell-like methods ###################################################################### sub sort($&) { my $self = shift; my $code = shift; if(ref $code eq 'CODE'){ return join q//, map{"$_\n"} CORE::sort{$code->($_)} split /\n/, $self->{document}; } else { return join q//, map{"$_\n"} CORE::sort split /\n/, $self->{document}; } } sub map($&) { my $self = shift; my $code = shift; if(ref $code eq 'CODE'){ return join q//, map{"$_\n"} CORE::map{$code->($_)} split /\n/, $self->{document}; } elsif(ref $code eq 'Regexp') { return join q//, map{"$_\n"} CORE::map{m($code)} split /\n/, $self->{document}; } } sub grep($&) { my $self = shift; my $code = shift; if(ref $code eq 'CODE'){ return join q//, map{"$_\n"} CORE::grep{$code->($_)} split /\n/, $self->{document}; } elsif(ref $code eq 'Regexp') { return join q//, map{"$_\n"} CORE::grep{m($code)} split /\n/, $self->{document}; } } sub uniq { my %h; return join q//, map{"$_\n"} grep{Encode::_utf8_off $_; !$h{md5 $_}++} split /\n/, $self->{document}; } ######################################## # d is for destructive ######################################## chain_sub d_map { $self->{document} = $self->map(@_); } chain_sub d_grep { $self->{document} = $self->grep(@_); } chain_sub d_sort { $self->{document} = $self->sort(@_); } chain_sub d_uniq { $self->{document} = $self->uniq(@_); } ###################################################################### # XML-related methods ###################################################################### use File::Temp qw/ :POSIX /; use XML::XPath; use XML::XPath::XMLParser; _alias to_xhtml => 'html_to_xhtml'; chain_sub html_to_xhtml { local $/; open STDERR, ">/dev/null"; my ($fh, $file) = tmpnam(); print {$fh} $self->{document}; close $fh; $self->{document} = `tidy -q -utf8 -asxhtml -numeric -wrap 160 < $file`; unlink $file; } chain_sub xpath { my $t; my $xp = XML::XPath->new(xml => $self->{document}); my $cnt = 0; foreach (@_){ my $nodeset = $xp->find($_); foreach my $node ($nodeset->get_nodelist) { $t .= (@_ > 1 ? "[% PATH_$cnt $_ %]\n" : undef). XML::XPath::XMLParser::as_string($node). "\n". (@_ > 1 ? "[% END %]" : undef). "\n\n" ; $cnt++; } } $self->{document} = $t; } chain_sub hpath { $self->html_to_xhtml->xpath(@_); } ###################################################################### # Tokenization ###################################################################### our $InCJK = qr( \p{InCJKUnifiedIdeographs} | \p{InCJKUnifiedIdeographsExtensionA} | \p{InCJKUnifiedIdeographsExtensionB} | \p{InCJKCompatibilityForms} | \p{InCJKCompatibilityIdeographs} | \p{InCJKCompatibilityIdeographsSupplement} | \p{InCJKRadicalsSupplement} | \p{InCJKSymbolsAndPunctuation} | \p{InHiragana} | \p{InKatakana} | \p{InKatakanaPhoneticExtensions} | \p{InHangulCompatibilityJamo} | \p{InHangulJamo} | \p{InHangulSyllables} )x; sub tokens { my @tok; if($self->{document} =~ /[aiueo0-9]/io){ while($self->{document} =~ /([\p{Latin}\p{Number}]+)/go){ my $tok = lc $1; next unless $tok; push @tok, $tok; } } if($self->{document} =~ /(?:$InCJK)/o){ # Extract unigrams my @t; while($self->{document} =~ /($InCJK)/go){ my $t = $1; next unless length($t) == 1; push @t, $t; } for (my $i=0; $i<$#t; $i++){ my $t = $t[$i].$t[$i+1]; push @tok, $t; } @tok = (@t, @tok); } @tok; } 1; __END__ =head1 NAME FEAR::API::Document - Document class for FEAR::API =head1 DESCRIPTION This package is used to store documents fetched by FEAR::API::Agent. =head1 METHODS =head2 length(), size() Returns length of document =head2 as_string() Stringifies document object. =head2 digest() Returns MD5 digests of documents =head2 is_utf8 Checks if document is encoded in UTF-8 =head2 utf8_on(), utf8_off() Turns on/off utf8 flag. =head2 iconv_from() Tries to convert document to UTF-8 from any encoding =head2 append() Appends your input to document object. =head2 clear() Clears document object =head2 try_compress(), try_uncompress() Tries to compress/uncompress document. =head2 mime_encode(), mime_decode() Encode/Decode your document in MIME =head2 html_to_xhtml() Upgrades documents to XHTML =head2 xpath() Applies XPath to your document =head2 hpath() Upgrades documents to XHTML, and applies XPath =head2 tokens() Tokenizes documents =head1 AUTHOR & COPYRIGHT Copyright (C) 2006 by Yung-chung Lin (a.k.a. xern) This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself =cut