#!/usr/bin/perl
#
#

use strict;
use warnings;
use IO::Handle;    # provides autoflush() for the converter pipe

# Splits an RDF/XML stream (files named on the command line, or STDIN) into
# numbered chunks. Each chunk is piped through
#     rapper - http://www.uniprot.org | gzip --fast -c > uniprotNNNN.nt.gz
# and, once complete, its file name is handed to
#     parallel rdf2hdt -B 'http://www.uniprot.org' {} {.}.hdt
#
# A chunk is only cut at a line starting with "<rdf:Description rdf:about"
# once more than $filelines lines have been counted for it, so a description
# is never split across two files. The first chunk relies on the header that
# is already part of the input; every later chunk is started with the
# contents of head.xml. Each chunk is terminated with "</rdf:RDF>".

# Line threshold after which the next description boundary starts a new chunk.
my $filelines=1000000;

my $bytes=0;        # total bytes read from the input
my $count=0;        # total lines read from the input
my $filecount=0;    # lines counted since the last rotation
my $filenum=0;      # zero-based index of the chunk being written

my $filename = sprintf("uniprot%04d.nt.gz", $filenum);

# head.xml is only needed when the input is large enough to rotate, so a
# missing file is reported now but only becomes fatal at the first rotation.
my $head = read_head("head.xml");
warn "Cannot read head.xml: $!\n" unless defined $head;

my $chunk = open_chunk($filename);

open my $converter, '|-', "parallel rdf2hdt -B 'http://www.uniprot.org' {} {.}.hdt"
	or die "Cannot start parallel rdf2hdt: $!\n";
# Without autoflush the file names stay in Perl's output buffer until the
# buffer fills or the handle is closed, so conversion of finished chunks
# would not overlap with splitting.
$converter->autoflush(1);

while (my $line = <>) {
	$bytes += length($line);
	$count++;
	$filecount++;

	if ($filecount > $filelines && $line =~ /^<rdf:Description rdf:about/) {
		print STDERR "File: $filename Lines: $filecount Total Lines: $count Total bytes: $bytes\n";
		close_chunk($chunk, $filename);

		# close_chunk() has waited for the pipeline, so the file is
		# complete before the converter learns about it.
		print {$converter} "$filename\n";

		$filenum++;
		$filecount=0;
		$filename = sprintf("uniprot%04d.nt.gz", $filenum);

		# A chunk without its header would not be a well-formed document.
		defined $head
			or die "Cannot start $filename: head.xml was not readable\n";

		$chunk = open_chunk($filename);
		print {$chunk} "$head\n";
	}
	print {$chunk} $line;
}
close_chunk($chunk, $filename);

# $filenum is a zero-based index, so the number of files written is one more.
my $total_files = $filenum + 1;
print STDERR "Finished: Total files: $total_files Total Lines: $count Total bytes: $bytes\n";
print {$converter} "$filename\n";

close $converter
	or warn "parallel rdf2hdt did not finish cleanly (status $?, error: $!)\n";

# Returns the whole content of $path, or undef (with $! set) if it cannot be
# opened.
sub read_head {
	my ($path) = @_;
	open my $in, '<', $path or return;
	local $/;    # slurp mode
	my $text = <$in>;
	close $in;
	return $text;
}

# Starts the rapper | gzip pipeline writing to $name and returns a write
# handle connected to its standard input. Dies if the pipeline cannot be
# started.
sub open_chunk {
	my ($name) = @_;
	open my $fh, '|-', "rapper - http://www.uniprot.org | gzip --fast -c > $name"
		or die "Cannot start pipeline for $name: $!\n";
	return $fh;
}

# Writes the closing RDF tag to the chunk and waits for its pipeline to
# finish. A failing pipeline is reported but not fatal, so the remaining
# input is still processed.
sub close_chunk {
	my ($fh, $name) = @_;
	print {$fh} "</rdf:RDF>\n";
	close $fh
		or warn "Pipeline for $name did not finish cleanly (status $?, error: $!)\n";
	return;
}
