#!/usr/local/bin/perl
#
# file: split-edgar
# auth: Brad Burdick
# desc: split SEC EDGAR data file into SGML header information file and
#       document text file.
#
# usage: split-edgar [-d datadir] [-e errdir] [-v] [-w workdir]
#                    [input_file(s)]
#
##########################################################################
# Copyright (c) 1994, 1995 Internet Multicasting Service
#
# The SEC EDGAR Level 1 Dissemination processing software ("software")
# was developed by the Internet Multicasting Service and may
# be used for academic, research, government, and internal business
# purposes without charge. You may not resell this code or include it
# in a product that you are selling without prior permission of the
# Internet Multicasting Service.
#
# This software is provided ``as is'', without express or implied
# warranty, and with no support nor obligation to assist in its
# use, correction, modification or enhancement. We assume no liability
# with respect to the infringement of copyrights, trade secrets, or any
# patents, and are not responsible for consequential damages. Proper
# use of the software is entirely the responsibility of the user.
##########################################################################

eval 'exec /usr/bin/perl -s $0 ${1+"$@"}'
    if 0;

#
# Overview
#   Reads each input file (or STDIN when none is given) line by line.
#   For every submission found it writes two files:
#       <workdir>/<accession-number>.hdr.sgml   the header lines
#       <workdir>/<accession-number>.txt        the document lines
#   If a target file already exists, or the accession number line is
#   missing, the output is redirected to a name obtained from
#   &get_next_file() under <errdir> instead.
#
# NOTE(review): all variables are intentionally left as package globals
# (no "use strict"/"my"); edgar-util.pl is not visible from here and may
# read some of them - confirm before tightening scoping.
#

# who am i?
($prog = $0) =~ s#.*/##;

# where we find our local libraries
push(@INC, '/usr/local/ims/lib');

# for processing command line options
require 'getopts.pl';

# Edgar general utility routines
# (presumably the source of &makepath and &get_next_file - they are not
# defined in this file)
require 'edgar-util.pl';

#
# SGML tags that drive the state machine below.
#
# NOTE(review): the tag literals were missing from the copy of this script
# under review (every "<...>" token had been stripped).  The values below
# were reconstructed from the surrounding comments and variable names
# ("SUBMISSIONs", "accession #", "$ims_hdr", "end of header", "end of
# document(s)") - confirm them against the EDGAR dissemination feed
# specification before relying on this script.
#
$tag_submission     = '<SUBMISSION>';        # start of a submission/header
$tag_accession      = '<ACCESSION-NUMBER>';  # must follow the start tag
$tag_document       = '<DOCUMENT>';          # first document => end of header
$tag_submission_end = '</SUBMISSION>';       # end of document(s)
$tag_ims_hdr        = '<IMS-HEADER>';        # wrapper we add around the header
$tag_ims_hdr_end    = '</IMS-HEADER>';

# date stamp for IMS header (YYYYMMDD); localtime returns years since 1900
@date = localtime;
$datestamp = sprintf("%04d%02d%02d", $date[5]+1900, $date[4]+1, $date[3]);

# process command line options, if any
&Getopts('d:e:vw:');

# verbose output?
$verbose = defined($opt_v);

# where to place submissions
$datadir = defined($opt_d) ? "$opt_d" : "/in/edgar";
&makepath($datadir, 0775);

# where to place normal submissions (-w is taken relative to $datadir)
$workdir = defined($opt_w) ? "$datadir/$opt_w" : "$datadir/work";
&makepath($workdir, 0775);

# where to place exception submissions (errors); -e is relative to $datadir
$errdir = defined($opt_e) ? "$datadir/$opt_e" : "$datadir/exceptions";
&makepath($errdir, 0775);

# base file name (accession # for now)
$accno = '';

# document text
@document = ();

# header text
@header = ();

# are we processing the header text?
$in_hdr = 0;

# are we processing the document text?
$in_doc = 0;

# take data from stdin if no file provided
if ($#ARGV < 0) {
    push(@ARGV, "<&STDIN");
}

foreach $file (@ARGV) {
    # Two-argument open is kept on purpose: it is what makes the "<&STDIN"
    # entry above work, and callers may rely on its other forms.
    open(IN, "$file") || die "$prog: $file: $!\n";

    # will block if input is STDIN and empty...
    while (defined($line = <IN>)) {
        # chomp, not chop: never eat the last character of an
        # unterminated final line
        chomp($line);

        #
        # assumes SUBMISSIONs are not nested
        # ignores junk outside of <SUBMISSION> ... </SUBMISSION> nest
        #
        if (! $in_hdr && ! $in_doc) {
            if (index($line, $tag_submission) >= 0) {   # start of header
                $in_hdr = 1;

                # the line after the start tag; may be missing at EOF
                $line = <IN>;
                $line = '' unless defined($line);
                chomp($line);

                #
                # we'll use the accession number as a file name for now.
                #
                if ($line =~ /\Q$tag_accession\E(\S+)/) {
                    $accno = $1;
                    $outfile = "$workdir/$accno.hdr.sgml";
                    if (-e $outfile) {
                        warn "$prog: $outfile already exists!!\n";
                        $outfile = &get_next_file("$errdir/$accno.hdr.dup01");
                        print "Processing $outfile ...\n" if $verbose;
                    }
                } else {
                    # error - accession # MUST be next
                    # NOTE(review): $accno still holds the value from the
                    # previous submission (or '' for the first one) here.
                    $outfile = &get_next_file("$errdir/$accno.hdr.err01");
                    print "Processing $outfile ...\n" if $verbose;
                }

                $ims_hdr = "$tag_ims_hdr$accno.hdr.sgml : $datestamp";
                push(@header, $ims_hdr);
                push(@header, $line);
                open(OUT, ">$outfile") || die "$prog: $outfile: $!";
            }
        } elsif (index($line, $tag_document) >= 0 && ! $in_doc) {
            # end of header
            $in_hdr = 0;
            $in_doc = 1;

            # save the header to $outfile and reset the array
            &save_header();

            $outfile = "$workdir/$accno.txt";
            if (-e $outfile) {
                warn "$prog: $outfile already exists!!\n";
                $outfile = &get_next_file("$errdir/$accno.txt.dup01");
                print "Processing $outfile ...\n" if $verbose;
            }
            open(OUT, ">$outfile") || die "$prog: $outfile: $!";

            # now process the document text
            push(@document, $line);
        } elsif (index($line, $tag_submission_end) >= 0) {
            # end of document(s)
            if ($in_hdr) {
                # The submission ended before any document started: OUT is
                # still the header file, so flush the header rather than
                # writing an empty document into it and leaking the header
                # lines into the next submission.
                &save_header();
            } else {
                # save the document(s) to $outfile and reset the array
                &save_document();
            }
            $in_hdr = 0;
            $in_doc = 0;
        } elsif ($in_hdr) {
            push(@header, $line);       # save the header line
        } else {
            push(@document, $line);     # save the document line
        }
    }

    close(IN);
}

# State is deliberately carried across input files (as before), so only
# after the last one do we know a submission was left unterminated.  Flush
# what was buffered instead of silently dropping it.
if ($in_hdr) {
    warn "$prog: input ended inside the header of submission '$accno'\n";
    &save_header();
} elsif ($in_doc) {
    warn "$prog: input ended inside the document text of submission '$accno'\n";
    &save_document();
}

exit 0;

# save_header: append the closing IMS header tag, write the buffered header
# lines to the currently open OUT handle, and empty the buffer.
sub save_header {
    push(@header, $tag_ims_hdr_end);
    &write_out(@header);
    @header = ();
}

# save_document: write the buffered document lines to the currently open
# OUT handle and empty the buffer.
sub save_document {
    &write_out(@document);
    @document = ();
}

# write_out(lines...): write the lines to OUT joined by newlines and with a
# trailing newline, then close OUT.  The close is checked because buffered
# write errors (e.g. a full disk) are only reported there.
sub write_out {
    print OUT join("\n", @_), "\n";
    close(OUT) || die "$prog: $outfile: $!\n";
}