#!/usr/bin/perl

## refdb-pubmed: retrieves Pubmed (http://pubmed.org) entries by their
## Pubmed IDs, as XML unless another format is requested with -m. This
## program accesses the data through the efetch utility
## (http://eutils.ncbi.nlm.nih.gov/corehtml/query/static/efetchlit_help.html)

## refdb-pubmed reads Pubmed IDs either on stdin or as parameters and
## writes the query results to stdout

## Usage:
## refdb-pubmed id0 id1... > outfile.xml
## where id0, id1... are Pubmed IDs
##
## refdb-pubmed < infile > outfile.xml
## where infile contains a list of Pubmed IDs, separated by non-numeric
## characters like whitespace, newlines, tabs, or commas

## markus@mhoenicka.de 2009-10-28

##   This program is free software; you can redistribute it and/or modify
##   it under the terms of the GNU General Public License as published by
##   the Free Software Foundation; either version 2 of the License, or
##   (at your option) any later version.
##   
##   This program is distributed in the hope that it will be useful,
##   but WITHOUT ANY WARRANTY; without even the implied warranty of
##   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##   GNU General Public License for more details.
   
##   You should have received a copy of the GNU General Public License
##   along with this program; if not, see <http://www.gnu.org/licenses/>


## this is our all-purpose web client
use LWP::Simple;

## this is to read the command line options
use Getopt::Std;

## this hash will receive the command line options
my %opts;

# option defaults; each one is overridden below if the matching switch
# was given on the command line
my $dbname = "pubmed";
my $mode = "xml";
my $type = "full";

## the command line switches are:
## -d database: [pubmed|pmc|journals|omim], default is pubmed
## -h: prints help
## -m retrieval mode: output data format [xml|html|text|asn.1]
## -t type: [uilist|abstract|citation|medline|full]
##
## getopts() strips the switches it recognizes from @ARGV and returns
## false when it meets an unknown switch or a switch whose argument is
## missing. Do not run a query with a half-parsed command line.
if (!getopts('d:hm:t:', \%opts)) {
    print STDERR "refdb-pubmed: invalid command line, try -h for help\n";
    exit(2);
}

## -h takes precedence over all other switches: print the usage message
## and stop without contacting the server
if ($opts{h}) {
    print "refdb-pubmed retrieves datasets from the Pubmed database\n";
    print "Usage: [perl] refdb-pubmed [-d database] [-h] [-m mode] [-t type] {PMID...}\nPMIDs can either be specified as parameters on the command line, or as a list of these IDs, separated by non-numeric characters (whitespace, newlines, commas, tabs...), which is read from stdin. Output is sent to stdout.\nOptions: -d database Pubmed database [pubmed|pmc|journals|omim], pubmed is\n                     default\n         -h          print this help and exit\n         -m mode     retrieval mode [xml|html|text|asn.1], xml is default\n         -t type     retrieval type [uilist|abstract|citation|medline|full],\n                     full is default\n";
    exit(0);
}

## replace the defaults with the values given on the command line; the
## values are passed on to the server unchecked
$dbname = $opts{d} if defined $opts{d};
$mode   = $opts{m} if defined $opts{m};
$type   = $opts{t} if defined $opts{t};

## comma-separated list of the IDs to retrieve, as expected by efetch
my $pubmedids;

## the individual IDs, in the order in which they were given
my @ids;

## ARGV contains the remaining non-option parameters. Test the number of
## parameters rather than the truth of the first one, so that a first
## parameter of "0" is not mistaken for "no parameters given"
if (@ARGV) {
    # IDs given as parameters are used as they are; only empty strings
    # are dropped. splice() empties @ARGV like the shift loop it replaces
    @ids = grep { length } splice(@ARGV);
}
else {
    # read stdin. Treat each input line as a series of non-digit-separated
    # ids. Collecting the digit runs (instead of splitting on the
    # separators) avoids an empty first ID on lines that start with a
    # separator, e.g. with leading whitespace
    while (my $line = <STDIN>) {
        push @ids, $line =~ /(\d+)/g;
    }
}

## there is nothing to ask the server for without at least one ID
if (!@ids) {
    print STDERR "refdb-pubmed: no Pubmed IDs given, try -h for help\n";
    exit(1);
}

$pubmedids = join(",", @ids);

## getstring is composed of the Pubmed eutils base address, the
## database name (pubmed), the comma-separated id list, the data
## format (xml), and the retrieval type (full). The values are
## interpolated as they are, without URL escaping. The https scheme is
## the base address given in the E-utilities documentation; LWP needs
## the LWP::Protocol::https module to access it
my $getstring = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=$dbname&id=$pubmedids&retmode=$mode&rettype=$type";

## now actually send query and print results to stdout. getprint reports
## a failed request on stderr and returns the HTTP response code
my $status = getprint($getstring);

## let the caller see whether the request succeeded; is_success is
## exported by LWP::Simple along with the other HTTP::Status helpers
exit(is_success($status) ? 0 : 1);
