#!/usr/bin/perl -w
# Command line usage:
#   perl ESTNameConversion input_file output_file [user_defined_extension]
#
# This script takes EST entries extracted from the NCBI website in FASTA
# format as input. For every description line (a line starting with ">") it:
#   * searches for a string of the form "ESTxxxx" (xxxx is a number);
#   * keeps only ">ESTxxxx" as the entry name;
#   * appends the user-given extension, if one was supplied, to the end of
#     the entry name.
# The extension is appended verbatim, so include the separator yourself
# (e.g. pass ".human" to obtain "EST1234.human").
# All other lines (the sequence data) are copied through unchanged.
#
# This script is used to make EST sequences from different organisms
# distinguishable just by name.

use strict;
use warnings;

@ARGV >= 2
    or die "Usage: perl $0 input_file output_file [user_defined_extension]\n";

my ($input_file, $output_file, $extension) = @ARGV;

# Read the whole input into memory before touching the output file, so the
# input is fully consumed before any overwrite decision is made.
my @seqs;
open my $in, '<', $input_file or die "Can't open input file: $!";
while (my $line = <$in>) {
    chomp $line;
    if ($line =~ /^>/) {
        # The greedy ".*" makes this keep the LAST "EST<digits>" token on the
        # line. Headers without such a token are left as they are (the
        # extension below is still appended to them).
        $line =~ s/^(>).*(EST\d+).*$/$1$2/;
        $line .= $extension if defined $extension;
    }
    push @seqs, $line;
}
close $in or die "Can't close input file: $!";

# Check whether the output file already exists.
# If so, ask before overwriting it.
my $answer = "yes";
if (-e $output_file) {
    while (1) {
        print "Output file existed, overwrite? yes or no: \n";
        $answer = <STDIN>;

        # End of input on STDIN (e.g. non-interactive run): nobody can
        # confirm, so take the safe path instead of looping forever.
        if (!defined $answer) {
            $answer = "no";
            last;
        }

        chomp $answer;
        last if $answer eq "yes" || $answer eq "no";
        print "Please answer yes or no!!! \n";
    }
}

if ($answer eq "no") {
    print "QUIT without overwriting existed output file!\n";
    exit;
}

# Write to the output file.
open my $out, '>', $output_file or die "Can't open output file: $!";
foreach my $entry (@seqs) {
    print {$out} "$entry\n";
}
# Buffered write errors only surface at close, so check it.
close $out or die "Can't close output file: $!";