#!/usr/bin/perl -w # tisi2bib is torstens perl program that translates bibliographic references # taken from the ISI Web of Science (inc. INSPEC) and translates them to # BibTeX references. Copyright(C) 2004-6 Torsten Lehmann, tlehmann@unsw.edu.au # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # To use this program first get your references from the ISI Web of # Science website: search for your references, mark all the references # of interest, click on "marked list" and press "save to file" in the # "field tagged" format. Then you'll have references on your disk, which # looks something like this when the default fields are specified: # # FN ISI Export Format # VR 1.0 # TI 1-V power supply CMOS cascode amplifier # AU Lehmann, T. # Cassia, M. # SO IEEE Journal of Solid-State Circuits # PY 2001 # PD July 2001 # VL vol.36, no.7 # JI IEEE J. Solid-State Circuits (USA) # BP 1082 # EP 1086 # PS 1082-6 # ER # # TI An implantable CMOS amplifier for nerve signals # AU Nielsen, J.H. # Lehmann, T. # SO ICECS 2001. 8th IEEE International Conference on Electronics, Circuits # and Systems (Cat. No.01EX483) # PY 2001 # PD 2001 # BP 1183 # EP 1186 # PS 1183-6 # ER # # more records may follow # # EF # # some stats on "save to file" on ISI WOK marked records page: # number of occurences in 405 records of default fiels # AU: 402/405 # BP: 403/405 # ED: 47/405 # EF: 2/405 # EP: 403/405 # ER: 405/405 # FN: 2/405 # JI: 316/405 # PD: 405/405 # PS: 404/405 # PY: 405/405 # SO: 404/405 # TI: 405/405 # VL: 316/405 # VR: 2/405 # # all fields specified (same data set): # AB: 405/405 # AU: 402/405 # AV: 3/405 # BN: 54/405 # BP: 403/405 # CL: 124/405 # CO: 314/405 # CT: 96/405 # CY: 124/405 # DE: 405/405 # DT: 405/405 # ED: 47/405 # EF: 2/405 # EP: 403/405 # ER: 405/405 # FN: 2/405 # ID: 405/405 # JI: 316/405 # LA: 405/405 # NR: 402/405 # PD: 405/405 # PS: 404/405 # PU: 283/405 # PV: 89/405 # PY: 405/405 # SN: 315/405 # SO: 404/405 # SP: 62/405 # TI: 405/405 # UT: 405/405 # VL: 316/405 # VR: 2/405 # # old records look more like this... # # FN ISI Export Format # VR 1.0 # PT J # AU ABBAS, PJ # BROWN, CJ # TI ELECTRICALLY EVOKED AUDITORY BRAIN-STEM RESPONSE - GROWTH OF RESPONSE # WITH CURRENT LEVEL # SO HEARING RESEARCH # PD JAN # PY 1991 # VL 51 # IS 1 # BP 123 # EP 138 # UT ISI:A1991EW97300010 # ER # # EF # # while some entries might look like this... # # FN ISI Export Format # VR 1.0 # PT J # AU Piyathaisere, DV # Margalit, E # Chen, SJ # Shyu, JS # D'Anna, SA # Weiland, JD # Grebe, RR # Grebe, L # Fujii, G # Kim, SY # Greenberg, RJ # De Juan, E # Humayun, MS # TI Heat effects on the retina # SO OPHTHALMIC SURGERY LASERS & IMAGING # PD MAR-APR # PY 2003 # VL 34 # IS 2 # BP 114 # EP 120 # UT ISI:000184623900004 # ER # # EF # # revision history # ---------------- # v. 0.51 added functions for automatic capitalisation of acronyms # v. 0.50 BP-EP entries alternative to PS entries fixed # VL-IS entries alternative to VL vol, num entries fixed # All-cap journal names fixed # Dashed-Cap title problem fixed # v. 0.20 fixed all-cap author entries # v. 0.13 first working version # # known problems # -------------- # automatic detection of publication type (article/inproceedings) is by # no means perfect. It is better than relying on the designated # database field, however! # auto capitalisation protection of names (Darwin, Edinburgh, ...) # need to be linked to database... # the ISI database has errors in it (some which this program tries to # address) - always check the output to see if it looks right. # # OK, thats enough talking; here comes the program: #------------------------------------------------------------------- #- first heaps of subrutine definitions ---------------------------- #------------------------------------------------------------------- sub do_authors { my (@autors, $author, $first, $s); $first = 1; $s = ""; @authors = split(';',$_[0]); foreach $author (@authors) { if (!$first) { $s = $s . " and " }; $first = 0; $author =~ s/\.([[:upper:]])/\. $1/g; $s = $s . $author } return $s; } sub cap_title { my (@words, $word, $npw, $pct, $first, $s, $title); return $_[0] if !$usc; $first = 1; $title = $_[0]; if (uc($title) eq $title) { $title=lc($title) }; $s = ""; @words = split(' ',$title); foreach $word (@words) { $word =~ /^([[:punct:]]*)(.*)$/; $npw = $2; $pct = $1; if (!$first) { $s = $s . " " }; if (!$first && index(" a an and as at by do does doing for from " . "in is of on " . "some the to with without "," " . lc($npw) . " ")>=0) { $s = $s . lc($word); next; } $first = 0; if ($word eq "-") { $s = $s . "---"; next; }; if ($word =~ /.+-.*/) { $word =~ s/-/ /g; $word = cap_title($word); $word =~ s/ /-/g; $s = $s . $word; next; }; if (lcfirst($npw) ne lc($npw)) { $s = $s . "{" . $word . "}"; next; } if (index(" Darwin Edinburgh Einstein Miller " . "Utah " . "Wheatstone ", " " . ucfirst($npw) . " ")>=0) { $s = $s . "{" . $pct . ucfirst($npw) . "}"; next; } if (index(" CMOS IEEE MOS " . "MOSFET " . "MOST ", " " . uc($npw) . " ")>=0) { $s = $s . "{" . $pct . uc($npw) . "}"; next; } $s = $s . $pct . ucfirst($npw); } return $s; } sub find_month { my $s; $s=$_[0]; return "Jan." if $s =~ /jan/i; return "Feb." if $s =~ /feb/i; return "Mar." if $s =~ /mar/i; return "Apr." if $s =~ /apr/i; return "May" if $s =~ /may/i; return "Jun." if $s =~ /jun/i; return "Jul." if $s =~ /jul/i; return "Aug." if $s =~ /aug/i; return "Sep." if $s =~ /sep/i; return "Oct." if $s =~ /oct/i; return "Nov." if $s =~ /nov/i; return "Dec." if $s =~ /dec/i; return ""; } sub abbr_journal { my $s; $s = $_[0]; $s =~ s/([^\(\)]+[^ \)]) *\([^\(\)]+\) *$/$1/; if (uc($s) eq $s) { $s=lc($s) }; if ($ajn) { $s =~ s/conference +proceedings[[:punct:]]*/Proc./ig; $s =~ s/proceedings/Proc./ig; $s =~ s/annual/Ann./ig; $s =~ s/international/Int./ig; $s =~ s/symposium/Symp./ig; $s =~ s/conference/Conf./ig; $s =~ s/congress/Cong./ig; $s =~ s/journal/J./ig; $s =~ s/transactions/Trans./ig; $s =~ s/letters/Lett./ig; $s =~ s/magazine/Mag./ig; $s =~ s/ +of +the +/ /ig; $s =~ s/ +of +/ /ig; $s =~ s/ +on +/ /ig; } return cap_title($s); } sub fix_authors { my (@autors, $author, $first, $s, $gnam, $fnam, @fnames, $fname, $first1); $first = 1; $s = ""; @authors = split(';',$_[0]); foreach $author (@authors) { if (!$first) { $s = $s . "; " }; $first = 0; ($fnam,$gnam) = split(/ *, */,$author); if ($gnam eq "") { print STDERR "missing given name in AU=$author\n"; } $gnam =~ s/\.//g; $gnam = join(". ", split(/ */,$gnam)) . "."; if ($fnam eq uc($fnam)) { @fnames = split(/ */,$fnam); $first1 = 1; $fnam=""; foreach $fname (@fnames) { if (!$first1) { $fnam = $fnam . " " }; $first1 = 0; $fnam = $fnam . ucfirst(lc($fname)); } } $s = $s . $fnam . ", " . $gnam; } return $s; } sub process_field { # ** field always there (default field set) # *+ field usually there # *- field might be there # *? field very occationally there # L** L*+ L*- L*? as above but for long field set my ($s,$s1,$s2,$newdt); $s = $_[0]; $s =~ /^FN +([^ ]+.*)$/ && do { # File type. 'ISI Export Format' ** }; $s =~ /^VR +([^ ]+.*)$/ && do { # File Version number ** }; $s =~ /^AU +([^ ]+.*)$/ && do { # Authors *+ $rec{'author'}=fix_authors($1); }; $s =~ /^TI +([^ ]+.*)$/ && do { # Title ** $rec{'title'}=cap_title($1); }; $s =~ /^ED +([^ ]+.*)$/ && do { # Editors *- $rec{'editor'}=fix_authors($1); }; $s =~ /^SO +([^ ]+.*)$/ && do { # journal/volume title, in full *+ $s1 = $1; if ($s=~/proceeding/i || $s=~/conference/i || $s=~/symposium/i || $s=~/colloqu/i || $s=~/advances/i || $s=~/congress/i || $s=~/workshop/i || $s=~/meeting/i || $s=~/seminar/i) { $newdt="Inproceedings"; } elsif ($s=~/journal/i || $s=~/transaction/i || $s=~/letter/i || $s=~/acta/i || $s=~/magazine/i || $s=~/annal/i) { $newdt="Article"; } else { # default should be Article; but could be Misc $newdt="Article"; } if ($rec{'doctype'} && $rec{'doctype'} ne $newdt && !$upt) { print STDERR "$s1: prediction=$newdt, DT=$rec{'doctype'}\n"; } if (!$rec{'doctype'} || $upt) { $rec{'doctype'} = $newdt; } $rec{'journal'}=$s1 if !$rec{'journal'} || $upt && $newdt eq "Article"; }; $s =~ /^CT +([^ ]+.*)$/ && do { # Conference title L*-- # $rec{'journal'}=$1; do nothing }; $s =~ /^CY +([^ ][^-]*)(.*) +([[:digit:]]+)$/ && do { # Conference year and date L*- $rec{'year'} = $3; $s1 = find_month($1); $s2 = find_month($2); if ($s1 ne "") { $rec{'month'} = $s1; $rec{'month'} = $rec{'month'} . "--" . $s2 if $s2 ne ""; } else { $rec{'month'} = $s2 if $s2 ne ""; } }; $s =~ /^CL +([^ ]+.*)$/ && do { # Conference location L*- $s1 = $1; $s1 =~ s/^([^,]*).*$/$1/; $rec{'address'} = $s1; }; $s =~ /^DT +([^ ]+.*)$/ && do { # Document type L** if ($s =~ /conference/i) { $newdt = "Inproceedings"; } elsif ($s =~ /journal/i || $s =~ /review/i || $s =~ /article/i) { $newdt = "Article"; } else { $newdt = "Misc"; print STDERR "Document type '$s' unknown\n"; }; if ($rec{'doctype'} && $rec{'doctype'} ne $newdt && !$upt) { print STDERR "$rec{'journal'}: prediction=$rec{'doctype'}, DT=$newdt\n"; } $rec{'doctype'} = $newdt if !$upt || !$rec{'doctype'}; }; $s =~ /^PU +([^ ][^,]+)(.*)$/ && do { # Publisher (inc. city) L*- $rec{'publisher'} = $1; if ( $2 ne "" && !$rec{'address'} ) { $2 =~ /[, ]+([^,]+),.*/ && do { $rec{'address'} = $1 } } }; $s =~ /^DE +([^ ]+.*)$/ && do { # Authors' original keywords L** }; $s =~ /^JI +([^ ]+.*)$/ && do { # ISO journal title abbr *- (if journal?) $rec{'abjournal'} = $1; }; $s =~ /^AB +([^ ]+.*)$/ && do { # Abstract L** }; $s =~ /^PS +([[:digit:]\-, \/]+[[:digit:]]).*$/ && do { # number of pages *+ $rec{'pages'}=$1; $s =~ /vol[[:punct:] ume]*(.*)/ && do { $rec{'volume'} = $1 if !$rec{'volume'}; } }; $s =~ /^BP +([^ ]+.*)$/ && do { # Beginnig page *+ (sometimes redundant) $s1 = $1; if ($rec{'pages'} && $rec{'pages'} =~ /\-[[:digit:]]+/) { $rec{'pages'}=$s1 . $rec{'pages'}; } $rec{'pages'}=$s1 . "-" if !$rec{'pages'}; }; $s =~ /^EP +([^ ]+.*)$/ && do { # Ending page *+ (sometimes redundant) $s1 = $1; if ($rec{'pages'} && $rec{'pages'} =~ /[[:digit:]]+\-/) { $rec{'pages'}=$rec{'pages'} . $s1; } $rec{'pages'}="-" . $s1 if !$rec{'pages'}; }; $s =~ /^PY +([^ ]+.*)$/ && do { # Publication year ** $rec{'year'}=$1 if !$rec{'year'}; }; $s =~ /^PD +([^ ]?[^-]*)(.*)$/ && do { # Publication date ** $s1 = find_month($1); $s2 = find_month($2); if ($s1 ne "") { $rec{'month'} = $s1; $rec{'month'} = $rec{'month'} . "--" . $s2 if $s2 ne ""; } else { $rec{'month'} = $s2 if $s2 ne ""; } }; $s =~ /^VL +([^ ]+.*)$/ && do { # Volume *- (if JI?) $s1 = $1; $s =~ /vol[[:punct:] ume]*([[:alnum:]-]+)/ && do { $rec{'volume'} = $1; }; $rec{'volume'} = $s1 if !$rec{'volume'}; $s =~ /n[ou][[:punct:] mber]*([[:alnum:]-]+)/ && do { $rec{'number'} = $1; } }; $s =~ /^IS +([^ ]+.*)$/ && do { # issue $rec{'number'} = $1 if !$rec{'number'}; }; # no action for these fields # $s =~ /^ER +([^ ]+.*)$/ && do {}; # end of a record ** # dont bother about these fields # $s =~ /^NR +([^ ]+.*)$/ && do {}; # Cited reference count L*+ # $s =~ /^SN +([^ ]+.*)$/ && do {}; # ISSN L*+ # $s =~ /^BN +([^ ]+.*)$/ && do {}; # some other ISSN like number L*- # $s =~ /^PV +([^ ]+.*)$/ && do {}; # Publisher City ??? L*? # $s =~ /^AV +([^ ]+.*)$/ && do {}; # some publisher title thing L*? # $s =~ /^LA +([^ ]+.*)$/ && do {}; # Language L** # $s =~ /^SP +([^ ]+.*)$/ && do {}; # Sponsor L*? # $s =~ /^UT +([^ ]+.*)$/ && do {}; # ISI unique article identifier L** # $s =~ /^ID +([^ ]+.*)$/ && do {}; # New keywords given by ISI (coded) L** # $s =~ /^CO +([^ ]+.*)$/ && do {}; # dunno L*- # $s =~ /^EF +([^ ]+.*)$/ && do {}; # end of file ** # other possible fields... # $s =~ /^PN +([^ ]+.*)$/ && do { $ref{''}=$1 }; # Part number # $s =~ /^PT +([^ ]+.*)$/ && do { $ref{'pubtype'}=$1 }; # Publication type # $s =~ /^C1 +([^ ]+.*)$/ && do {}; # Research addresses # $s =~ /^CR +([^ ]+.*)$/ && do {}; # Cited references # $s =~ /^TC +([^ ]+.*)$/ && do {}; # Times cited # $s =~ /^SE +([^ ]+.*)$/ && do { $ref{''}=$1 }; # Book series title # $s =~ /^SU +([^ ]+.*)$/ && do {}; # Supplement # $s =~ /^SI +([^ ]+.*)$/ && do {}; # Special issue # $s =~ /^GA +([^ ]+.*)$/ && do {}; # ISI document delivery number # $s =~ /^WP +([^ ]+.*)$/ && do {}; # Publisher web address # $s =~ /^RP +([^ ]+.*)$/ && do {}; # Reprint address # $s =~ /^J9 +([^ ]+.*)$/ && do {}; # 29-character journal title abbreviation } sub print_field { my (@words, $s, $word, $cleft, $ps); $s = $_[0]; $ps = $_[1]?$_[1]:" "; $s =~ s/&/\\&/g; $s =~ s/\$/\\\$/g; $s =~ s/%/\\%/g; @words = split(' ',$s); $cleft = 78 - length($ps); print $ps; foreach $word (@words) { if (length($word)<$cleft) { print " " . $word; $cleft -= length($word)+1; } else { print "\n" . $ps . " " . $word; $cleft = 72 - length($word) - length($ps); } } print "\n"; } sub comp_ps { my ($s, $s1,$s2, $de); $s = $_[0]; # 123-1123 -> 123-1123 # 123-234 -> 123-234 # 123-134 -> 123-34 # 123-124 -> 123-4 return $s if $s !~ /([[:digit:]]+)\-([[:digit:]]+)/; $s1 = $1; $s2 = $2; return $s if (length($s1) != length($s2)); $de = 0; while ($de < length($s1) && substr($s1,$de,1) eq substr($s2,$de,1)) { $de = $de +1; }; return $s1 if ($de == length($s1)); return $s1 . "-" . substr($s2,$de); } sub print_record { my (%rec,$journ); %rec = @_; $journ=""; if ($isc) { print $rec{'key'},"\n"; print_field("SO " . $rec{'journal'},"%-") if $rec{'journal'}; }; print "@" . $rec{'doctype'} . "{" . $rec{'docref'} . ",\n"; print_field("Author = {" . do_authors($rec{'author'}) . "},"); print_field("Title = {" . $rec{'title'} . "},"); print_field("Editor = {" . do_authors($rec{'editor'}) . "},") if $rec{'editor'}; $journ = $rec{'journal'} if $rec{'journal'}; $journ = $rec{'abjournal'} if $uia && $rec{'abjournal'}; if ($rec{'doctype'} eq "Article") { print_field("Journal = {" . abbr_journal($journ) . "},") if $journ; } elsif ($rec{'doctype'} eq "Inproceedings") { print_field("Booktitle = {" . abbr_journal($journ) . "},") if $journ; } else { # is "Misc print_field("Howpublished = {" . abbr_journal($journ) . "},") if $journ; } print_field("Publisher = {" . cap_title($rec{'publisher'}) . "},") if $rec{'publisher'}; print_field("Address = {" . $rec{'address'} . "},") if $rec{'address'}; print_field("Volume = {" . $rec{'volume'} . "},") if $rec{'volume'}; print_field("Number = {" . $rec{'number'} . "},") if $rec{'number'}; print_field("Month = {" . $rec{'month'} . "},") if $rec{'month'}; print_field("Pages = {" . comp_ps($rec{'pages'}) . "},") if $rec{'pages'}; print " Year = {" . $rec{'year'} . "}"; if ($iwf) { print ",\n Where = {Web of Science} }\n\n"; } else { print " }\n\n"; } } sub make_docref { my (@autors, $author, $s, $year, $k, $anum); $year = $_[1]; $s = ""; $k = "%s"; @authors = split(';',$_[0]); $anum=0; foreach $author (@authors) { $anum++; if ($author =~ /,/) { $author =~ s/[a-z ]*(.*),.*/$1/; # remove "de von der" and initials } else { $author =~ s/^.*?([^ ]+)$/$1/; # remove first names } $k = $k . " " . $author; $s = $s . substr($author,0,3) if $anum<4; } return ($s . substr($year,2,2), $k . " " . $rec{'year'}); } sub process_record { my ($a,$newkey,$newdr); $rec{'doctype'} = "Misc" if !$rec{'doctype'}; $rec{'author'} = '---' if !$rec{'author'}; ($rec{'docref'},$rec{'key'}) = make_docref($rec{'author'},$rec{'year'}); $a='a'; $newdr=$rec{'docref'}; $newkey=$rec{'key'}; while ($usedkeys{$newdr}) { $a++; $newdr=$rec{'docref'} . $a; $newkey=$rec{'key'} . $a; } $rec{'docref'} = $newdr; $rec{'key'} = $newkey; $usedkeys{$newdr} = $rec{'key'}; $allrecs{$newkey}={ %rec }; } #------------------------------------------------------------------- #- main program starts here ---------------------------------------- #------------------------------------------------------------------- #---------------------------------------------- # setup variables $ver = "0.51"; $now = localtime; print STDERR "tisi2bib, ISI to BibTeX converter, v. ", $ver, " Copyright (C) 2004-6 Torsten Lehmann\n"; print STDERR "This software is copyrighted under the ", "Gnu General Public License (www.gnu.org)\n"; print "% This BibTeX data base was generated by tisi2bib v. ",$ver, " (C) 2004-6 GPL\n"; print "% at ",$now, "\n\n"; $s = ""; $jc = ' '; %rec=(); %allrecs=(); %usedkeys=(); $ajn = 1; # abbreviate jounal name $srt = 1; # sort output $iwf = 1; # include 'where' field $isc = 1; # include journal source comments $upt = 1; # use predicted document type $uia = 1; # use isi journal abbreviation $usc = 1; # use smart title cap #---------------------------------------------- # process comman line options while ($_=$ARGV[0], $_ && $_ =~ /^-/) { shift @ARGV; last if /^--$/; /^-a/ && do { # abbreviate j/c name $ajn = 1; next; }; /^-noa/ && do { # no abbreviate j/c name $ajn = 0; next; }; /^-c/ && do { # source as coments $isc = 1; next; }; /^-noc/ && do { # no source as coments $isc = 0; next; }; /^-h/ && do { # help! print STDERR "tisi2bib [switches] file file ....\n"; print STDERR " -[no]a Abbreviate Journal/Conference name (default)\n"; print STDERR " -[no]c Include sort coment (default)\n"; print STDERR " -h This message\n"; print STDERR " -[no]i Use ISI Journal abbreviation (default)\n"; print STDERR " -[no]p Use predicted publication type (default)\n"; print STDERR " -[no]s Sort output (default)\n"; print STDERR " -[no]t Use smart title capitalisation (default)\n"; print STDERR " -[no]w Include 'Where' field (default)\n"; exit -1; }; /^-i/ && do { # use isi abbreviation $uia = 1; next; }; /^-noi/ && do { # no use isi abbreviation $uia = 0; next; }; /^-p/ && do { # use predicted data type $upt = 1; next; }; /^-nop/ && do { # no use predicted data type $upt = 0; next; }; /^-s/ && do { # sort output $srt = 1; next; }; /^-nos/ && do { # no sort output $srt = 0; next; }; /^-t/ && do { # smart title cap $usc = 1; next; }; /^-not/ && do { # no smart title cap $usc = 0; next; }; /^-w/ && do { # include where field $iwf = 1; next; }; /^-now/ && do { # no include where field $iwf = 0; next; }; print STDERR "Unknown switch $_; try -h\n"; exit -1; }; #---------------------------------------------- # main loop goes throug all remining arguments while (<>) { chomp; /^ +([^ ]+.*)$/ && do { # join multi-line fields $s = $s . $jc . $1; next }; if (!($s eq "")) { # process field if not emtpy process_field($s); $s = ""; }; (/^ER.*$/ || /^ *$/) && do { # end of record if ((scalar keys %rec)!=0) { # process it if not empty process_record; } %rec=(); next; }; (/^AU .*$/ || /^ED .*$/) && do { # AU and ED fields have sub-fields $jc = ';'; $s = $_; next; }; $jc = ' '; $s = $_; } if ($srt) { @order = (sort keys %allrecs); } else { @order = (keys %allrecs); } foreach $key (@order) { print_record(%{$allrecs{$key}}); }