taxr/taxdata_import.pl

#!/usr/bin/perl -w

use strict;
use Data::Dumper;
use DBI();

my $INFILE = "taxdata.csv";

open(my $fh, "<", $INFILE) or die "Cannot open $INFILE for reading: $!";

my $database = 'taxbrackets';
my $hostname = '127.0.0.1';
my $username = 'root';
my $password = '';

my $dsn = "DBI:mysql:database=$database;host=$hostname;port=3306";
my $dbh = DBI->connect($dsn, $username, $password) or die "Couldn't connect to MySQL server at $hostname (user $username): $!";

# $dbh->trace(2);

my %YEARS;

while (defined(my $year_line = <$fh>)) {
   chomp($year_line);
   print "$year_line\n";
   # if ($line =~ /^,,(Nominal),,,,(\d{4}),,,,,,,,,,,,,,(Adjusted for Inflation)?,,,,(\d{4}),,,CPI: ([\d\.]+)(?: \(Projected\))?,,,,,,,$/) {
   # ,,Nominal,,,,2011,,,,,,,,,,,,,,Adjusted for Inflation,,,,2011,,,CPI: 224.380 (Projected),,,,,,,
   # ,,Nominal,,,,2010,,,,,,,,,,,,,,Adjusted for Inflation,,,,2010,,,CPI: 218.056,,,,,,,
   if ($year_line =~ /,,(Nominal),,,,(\d{4})(?: \(\w\))?,,,,,,,,,,,,,,(Adjusted for Inflation)?,,,,(\d{4})(?: \(\w\))?,,,CPI: ([\d\.]+)(?: \(Projected\))?,,,,,,,/) {
      my $yr = $2;
      my $cpi = $5;
      print "year: $yr  cpi: $cpi\n";
      
      # Filing status line.
      my $filing_status_line = <$fh>;
      chomp($filing_status_line);
      my @statuses = ($filing_status_line =~ /,(Married Filing Jointly),,,,(Married Filing Separately),,,,(Single),,,,(Head of Household),,,,,,(Married Filing Jointly),,,,(Married Filing Separately),,,,(Single),,,,(Head of Household),,,/);
      print "$filing_status_line\n";
      print "^... is a filing status line: " . join(", ", map { defined($_) ? "'$_'" : 'undef' } @statuses) . "\n";
      
      # Skip headers...
      <$fh>; # skip "Marginal"|"Tax Brackets"
      <$fh>; # skip "Tax Rate"|"Over"|"But Not Over"
      
      # Compose nominal & adjusted lists for this year.
      my @notes;
      my %nominal;
      my %adjusted;
      while (defined(my $data_line = <$fh>)) {
         chomp($data_line);
         print "$data_line\n";
         
         # Match & extract columns.
         my $percent = '[\d\.]+%';
         my $dollar = '\$[\d,]+|-';
         my $q = '"?';
         my $colgroup = "(?:($percent)?,$q($dollar)?$q,$q($dollar)?$q|,$q.+$q,)";
         my $datagroup = "$colgroup,,$colgroup,,$colgroup,,$colgroup";
         my $line_regex = "$datagroup,,,,$datagroup";
         
         #                      Columns:
         #           Nom                       Adj
         #  MFJ   MFS   Sng   HoH     MFJ   MFS   Sng   HoH
         # R O B R O B R O B R O B   R O B R O B R O B R O B
         if ($data_line =~ /^,{34}/) {
            print "^... is a blank line.\n";
            last;
         } elsif (my @matches = ($data_line =~ /^$line_regex/)) {
            print "^... is a data line: " . join(", ", map { defined($_) ? "'$_'" : 'undef' } @matches) . "\n";
            # print Dumper(@matches) . "\n\n";
            
            @matches = map { if (defined($_)) { s/[,\$%]//g; /^[\d\.]+$/ ? $_ * 1.0 : $_; } } @matches;
            
            # Nominal data group.
            NOMINAL: for my $i (0..3) {
               my $first = $i * 3;
               
               # This column is blank / has a note instead of data.
               next NOMINAL if $matches[$first] eq '';
               
               my %rate;
               my @keys = ('rate', 'from', 'to');
               for my $i (0..2) {
                  $rate{$keys[$i]} = $matches[$first + $i];
               }
               my $status = $statuses[$i]; # status this column is under
               
               $dbh->do("INSERT INTO `brackets` (`year`, `status`, `rate`, `from`, `to`, `dollars`, `cpi`)" .
                        "VALUES (?, ?, ?, ?, ?, ?, ?)",
                        undef, $yr, $status, $rate{'rate'}, $rate{'from'}, $rate{'to'} eq '-' ? undef : $rate{'to'}, 'nominal', $cpi);
               
               if (!defined($nominal{$status})) {
                  $nominal{$status} = ();
               }
               
               push @{ $nominal{$status} }, { %rate };
            }
            
            # Adjusted data group.
            ADJUSTED: for my $i (0..3) {
               my $first = $i * 3 + 12;
               
               # This column is blank / has a note instead of data.
               next ADJUSTED if $matches[$first] eq '';
               
               my %rate;
               my @keys = ('rate', 'from', 'to');
               for my $i (0..2) {
                  $rate{$keys[$i]} = $matches[$first + $i];
               }
               my $status = $statuses[$i]; # status this column is under
               
               $dbh->do("INSERT INTO `brackets` (`year`, `status`, `rate`, `from`, `to`, `dollars`)" .
                        "VALUES (?, ?, ?, ?, ?, ?)",
                        undef, $yr, $status, $rate{'rate'}, $rate{'from'}, $rate{'to'} eq '-' ? undef : $rate{'to'}, 'adjusted');
               
               if (!defined($adjusted{$status})) {
                  $adjusted{$status} = ();
               }
               
               push @{ $adjusted{$status} }, { %rate };
            }
            # 1, 2, 3: $nom{$statuses[0]}
            # 4, 5, 6: $nom{$statuses[1]}
            # 7, 8, 9: $nom{$statuses[2]}
            # 10, 11, 12: $nom{$statuses[3]}
         } elsif ($data_line =~ /^"?([^,].+)"?,{18}"?([^,].+)"?,{16}/) {
            my $note = $1;
            print "^... is a note line: '$note'\n";
            $note =~ s/""/"/g;
            $note =~ s/^\(\w\) //g;
            push @notes, $note;
         } else {
            print "^ line did not match.\n";
         }
         # print Dumper(%nominal) . "\n\n";
         # print Dumper(%adjusted) . "\n\n";
         # last;
      }
      
      for (@notes) {
         $dbh->do("INSERT INTO `notes` (`year`, `note`)" .
                  "VALUES (?, ?)",
                  undef, $yr, $_);
      }
      
      $YEARS{$yr} = {
         'nominal' => { %nominal },
         'adjusted' => { %adjusted },
         'notes' => [ @notes ]
      };
      # last if (scalar keys %YEARS) == 40;
   } else {
      print "^ line did not match.\n";
   }
   # last;
}
print Dumper(%YEARS) . "\n\n";

$dbh->disconnect();

# %YEARS = {
#    1913: { # %year
#       nominal: { # %nom
#          'Married Filing Jointly': [
#          {
#             rate: 1%,
#             over: 10000,
#             but_not_over: 100000
#          },
#          {
#             rate: 2%,
#             over: 20000,
#             but_not_over: 200000
#          },
#          ],
#          'Married Filing Separately': {
#             rate: 1%,
#             over: 10000,
#             but_not_over: 100000
#          }
#       },
#       adjusted: { # %adj
#          
#       },
#       notes: { # @notes
#          
#       }
#    },
#    1914: {
#       
#    },
#    ...
# }


# [
#    {
#       "year": 1913,
#       "cpi": ...,
#       "statuses": {
#          "Married Filing Jointly": [
#             ...
#          ]
#       },
#       "notes": [
#          ...
#       ]
#    }
# ]