#!/usr/bin/perl

use strict;
use warnings;
use Data::Dump 'dump';
use List::Util qw(first);
use List::MoreUtils qw(pairwise minmax);

sub any { $_ && return 1 for @_; 0 }

my $filename = "42589_rv630_rrg_1.01o.ps";
my $page;

my %fonts = ( "F50683" => 1, "F4486" => 1, "F4487" => 1 );

my %bad_page = ( 86 => 1, 214 => 1, 332 => 1);

sub find_col_lines {
    my ($t, $v) = @_;
    my $min_y = $t->{left}{y} - 0.5;
    my $max_y = $t->{left}{y} + $t->{left}{h};
    my $min_x = $t->{left}{x};
    my $max_x = $t->{right}{x};
    my @col_lines = grep { $_->{x} >  $min_x and $_->{x} <  $max_x
                        and $_->{y} >= $min_y and $_->{y} <= $max_y } @$v;
    if (@col_lines) {
        $t->{col_top} = $col_lines[0]{y} + $col_lines[0]{h};
    }
    $t->{col_lines} = [ map $_->{x}, @col_lines ];
}

sub find_tables {
    my ($v, $page_num) = @_;
    my ($min_x, $max_x) = minmax(map $_->{x}, @$v);
    my @min = grep $_->{x} == $min_x, @$v;
    my @max = grep $_->{x} == $max_x, @$v;
    my $num = 0;
    my @tables = pairwise {{num=>$num++,left=>$a,right=>$b}} @min, @max;
    foreach my $t (@tables) {
        if ($bad_page{$page_num}) {
            $t->{col_lines} = [ 239.52, 280.02, 329.52 ];
            $t->{col_top} = 104.7 + 633.54;
        } else {
            find_col_lines($t, $v);
        }
    }
    return [grep $_->{col_top}, @tables];
}

sub find_table {
    my ($tables, $y) = @_;
    return first {
        my $l = $_->{left};
        $l->{y} < $y and $l->{y} + $l->{h} > $y
    } @$tables;
}

sub find_table_col {
    my ($t, $x, $y) = @_;

    $y > $t->{col_top} and return "heading";

    my $col = 0;
    foreach (@{$t->{col_lines}}) {
        last if $_ > $x;
        $col++;
    }

    return $col;
}

sub parse_page {
    my $p = shift;
    my $text = $p->{text};

    my @h = grep { $_->{w} > 20 and $_->{h} > 0.2 and $_->{h} < 0.8 } @{$p->{lines}};
    my @v = grep { $_->{w} < 0.9 and $_->{h} > 1 } @{$p->{lines}};

    @v or return;

    my $tables = find_tables(\@v, $p->{num});
    @$tables or return;
    $p->{tables} = $tables;

    my $prev;
    my $cur_text = '';
    my $cur_x;
    my $cur_font;
    foreach my $t (@$text) {
        my $x = $t->{x};
        my $y = $t->{y};
        undef $cur_x;
        shift @{$t->{lines}} eq "0 0 Td\n" or die;
        foreach (@{$t->{lines}}) {
            if (m{^/(F\d+)_0 1 Tf$}) {
                $cur_font = $1;
            }
            if (m{^\((.+)\) ([\d.]+) Tj$}) {
                if($fonts{$cur_font} and $1 ne "\\225") {
                    $cur_text .= $1;
                    defined $cur_x or $cur_x = $x;
                    $x += $2 * $t->{mul_x};
                    next;
                }
                $x += $2 * $t->{mul_x};
            }
            if (m{(-?[\d.]+) TJm$} and $fonts{$cur_font}) {
                my $tjm = -$1 * 0.01;
                $x += $tjm;
                if ($tjm < 1) {
                    next;
                } elsif ($tjm < 6.5) {
                    $cur_text .= ' ';
                    next;
                }
            }
            if (m{^(-?[\d.]+) (-?[\d.]+) Td$}) {
                $x = $t->{x} + $1 * $t->{mul_x};
                my $y2 = $t->{y} + $2 * $t->{mul_y};
                next if $y2 == $y;
                $y = $y2;
            }
            if (length $cur_text and $fonts{$cur_font}) {
                found_text($p, $cur_x, $y, $cur_text);
                $cur_text = '';
                undef $cur_x;
            }
        }
        if ($cur_text and $fonts{$cur_font}) {
            found_text($p, $cur_x, $y, $cur_text);
            $cur_text = '';
            undef $cur_x;
        }
    }
    return { num => $p->{num}, text => $p->{text2}};
}

sub found_text {
    my ($page, $x, $y, $text) = @_;
    my $table = find_table($page->{tables}, $y);
    my $col;
    if (defined $table) {
        $col = find_table_col($table, $x, $y);
    }
    $text =~ s/\\([()])/$1/g;
    my $i = {
        table   => (defined($table) ? $table->{num} : undef),
        col     => $col,
        text    => $text,
    };
    push @{$page->{text2}}, $i;
}

sub near {
    my ($a, $b) = @_;
    return abs($a - $b) < 1;
}

open my $fh, $filename or die "$filename: $!";
my (@text, $text, @lines, $page_num);
my @parsed;
while (<$fh>) {
    if ($_ eq "pdfEndPage\n") {
        $page_num > 344 and last;
        push @parsed, parse_page({
            num     => $page_num,
            lines   => \@lines,
            text    => \@text,
        });
        @lines = (); @text = ();
    }
    if (/^%%Page: (\d+) \d+/) {
        $page_num = $1;
        @text = ();
        next;
    }
    if (/^\[(.* [1-9]\d*(\.\d+)?)\] Tm$/) {
        $1 =~ /^  ([1-9]\d*(?:\.\d+)?)\ 0\ 0
                \ ([1-9]\d*(?:\.\d+)?)
                \ ([1-9]\d*(?:\.\d+)?)
                \ ([1-9]\d*(?:\.\d+)?)$/x or die $1;
        $text = { mul_x => $1, mul_y => $2, 
                      x => $3,     y => $4, lines => [] };
        push @text, $text;
        next;
    }
    if ($_ eq "[1 0 0 1 0 0] Tm\n") {
        undef $text
    }
    if (/^.* (Tj|Td|TJm)$/) {
        $text or next;
        push @{$text->{lines}}, $_;
        next;
    }
    if (/^.* Tf$/) {
        m{^/F(\d+)_0 1 Tf$} or die;
        @text or next;
        push @{$text->{lines}}, $_;
        next;
    }
        if (/(\d+\.\d+) (\d+\.\d+) (\d+\.\d+) (\d+\.\d+) re$/) {
            push @lines, { x => $1, y => $2, w => $3, h => $4 };
            next;
        }

}
close $fh;

print dump (\@parsed), "\n";
