#!/usr/bin/perl #Take variables from the fields filled out on the previous web page and #delete every character outside the whitelist in the substitution below #from each field value require 'miner-lib'; my $dir = "/share/bread/www-data/data/webminer"; &ReadParse; foreach (keys(%in)) { $in{$_} =~ s/[^A-Za-z0-9\.\,\_\-\[\]\{\}\*\?\+\\]//g; } #Print appropriate header information for HTML or spreadsheet output if ($in{tabdelimited}) { print "Content-type: spreadsheet/tab-delimited\n\n"; } else { print &PrintHeader; } #Decide whether to print second page (where you set thresholds) #or final output page (with results of the search) if ($in{makejspage}) { &MakeJSPage; } else { &DoSearch; } #Generate second page, which is a form listing the criteria selected on #the first page and a field to set the expression threshold (or motif #to search for, etc.) for each criterion sub MakeJSPage { print '\n"; @choices = ("empty", split(/\\0/, $in{chosen})); $number_of_crits = $#choices; print ""; print "
"; foreach $x (1..$#choices) { if ($choices[$x] =~ /^exp\.orfs\.maxh/) { &PrintMaxh; } elsif ($choices[$x] =~ /^exp\.orfs\.pep/) { &PrintMotif; } elsif ($choices[$x] =~ /^exp\.orfs\.prom/) { &PrintPromoter; } elsif ($choices[$x] =~ /^exp\.orfs\.iso/) { &PrintIso; } elsif ($choices[$x] =~ /^exp\.orfs\.mw/) { &PrintMass; } elsif ($choices[$x] =~ /^exp\.orfs\.dups/) { &PrintDups; } elsif ($choices[$x] =~ /^exp\.([^\.]*)\.([^\.]*)/) { &LookupExpNames; $class = $1; $expt = $2; &PrintExpJS; } print "

"; } print ""; print "Output as tab-delimited instead of HTML table
"; print ""; } #Print the criterion fields for a MaxH search sub PrintMaxh { print "Criterion $x: Maximum hydrophobicity value of the protein"; print "
Help with MaxH searches
"; print ""; print "Proteins with MaxH values "; print ""; print ""; } #Print the criterion fields for a protein motif search sub PrintMotif { print "Criterion $x: Proteins containing the following motif"; print "
Help with protein motif searches
"; print ""; print "Proteins with the motif "; print ""; } #Print the criterion fields for a promoter motif search sub PrintPromoter { print "Criterion $x: Genes with the following promoter sequence within 1 kb upstream of their start codons"; print "
Help with promoter searches
"; print ""; print "Promoters that contain "; print ""; } #Print the criterion fields for an isoelectric point search sub PrintIso { print "Criterion $x: Predicted isoelectric point of the protein
"; print ""; print "Proteins with isoelectric points "; print ""; print " pH "; } #Print the criterion fields for a molecular weight search sub PrintMass { print "Criterion $x: Predicted molecular weight of the protein
"; print ""; print "Proteins with molecular weight "; print ""; print " kDa"; } #Print the criterion fields for a gene duplicates search sub PrintDups { print "Criterion $x: Genes contained in duplicated blocks within the genome"; print "
Help with duplicate gene searches
"; print ""; print "Genes that "; print ""; print " contained in a duplicated block in the genome"; } #Get the experiment descriptors for the gene expression searches that #were chosen as criteria (builds the %classdescs and %exptdescs hashes #from the first two tab-separated fields of each line of $dir/classdescs #and $dir/exptdescs). NOTE(review): both while () loops have an empty #condition and never read from IN -- confirm against the deployed script sub LookupExpNames { open(IN, "<$dir/classdescs"); while () { /^([^\t]*)\t([^\t]*)/; $class = $1; $longcrit = $2; $classdescs{$class} = $longcrit; } open(IN, "<$dir/exptdescs"); while () { /^([^\t]*)\t([^\t]*)/; $expt = $1; $longcrit = $2; $exptdescs{$expt} = $longcrit; } } #Print the criterion fields for a gene expression search; when no #description is stored for the class or the experiment, the raw class or #experiment key is used as its label instead sub PrintExpJS { if (!$classdescs{$class}) {$classdescs{$class} = $class;} print "Criterion $x: Expression in $classdescs{$class}
"; print ""; print "Genes which are induced "; print ""; print ""; print "-fold in the experiment "; print ""; if (!$exptdescs{$class.".".$expt}) {$exptdescs{$class.".".$expt} = $expt;} print "" . $exptdescs{$class . "." . $expt} . "
"; } #Go through each criterion and score every ORF as passing it or not #and then print the results table with every ORF (linked to YPD), its #gene name, the matching criteria values for all search criteria, and #its description and pathway information sub DoSearch { foreach $i (1..$in{critnumber}) { $crit = "crit" . $i; if ($in{$crit} eq "exp") { &DoExpSearch; } if ($in{$crit} eq "maxh") { &DoMaxhSearch; } if ($in{$crit} eq "motif") { &DoMotifSearch; } if ($in{$crit} eq "promoter") { &DoPromoterSearch; } if ($in{$crit} eq "iso") { &DoIsoSearch; } if ($in{$crit} eq "mass") { &DoMassSearch; } if ($in{$crit} eq "dups") { &DoDupsSearch; } } &GetNames; &GetPaths; &GetDescs; foreach $matched_orf (keys(%matches)) { if (!($names{$matched_orf})) { $names{$matched_orf} = $matched_orf; } $matchcounts[$matches{$matched_orf}]++; push(@matched_names, $names{$matched_orf}); $orf_by_name{$names{$matched_orf}} = $matched_orf; } if ($in{tabdelimited}) { $format = "

";
		$endformat = "
"; $line = "\n"; $sep = "\t"; } else { print ""; print "
"; print "
Webminer has found
"; $i = $in{critnumber}-3; if ($i<1) {$i=1}; foreach $j ($i..$in{critnumber}-1) { $num_matches = 0; foreach $k ($j..$#matchcounts) { $num_matches += $matchcounts[$k]; } print "$num_matches ORFs meeting $j "; if ($j > 1) { print "criteria,
"; } else { print "criterion,
"; } } if ($#matchcounts>1) { print "and "; } print ""; if (!$matchcounts[$in{critnumber}]) { $matchcounts[$in{critnumber}] = 0; } print "$matchcounts[$#matchcounts] ORFs"; print "
"; print "meeting "; if ($#matchcounts>1) { print "all of "; } print "your search criteria.
"; $format=""; $endformat = "
"; $line = ""; $sep = ""; } print "$format"; print "$line$sep" . "ORF" . $sep . "Name$sep"; &LookupExpNames; foreach $z (1..$in{critnumber}) { if ($in{"expt" . $z}) { print $exptdescs{$in{"expt" . $z}} . "$sep"; } if ($in{"crit" . $z} =~ /maxh/) { print "MaxH$sep"; } if ($in{"crit" . $z} =~ /motif/) { $motif = $in{"motif" . $z}; print "Motif '$motif'$sep"; } if ($in{"crit" . $z} =~ /promoter/) { $promoter = $in{"promoter" . $z}; print "Promoter '$promoter'$sep"; } if ($in{"crit" . $z} =~ /mass/) { print "M.W.$sep"; } if ($in{"crit" . $z} =~ /iso/) { print "Isoelectric Pt.$sep"; } if ($in{"crit" . $z} =~ /dups/) { print "Duplicated Block$sep"; } } print "Pathway $sep"; print "Description $sep"; print "$line"; foreach $matched_name (sort(@matched_names)) { $matched_orf = $orf_by_name{$matched_name}; if ($matches{$matched_orf} == $in{critnumber}) { print "$sep"; if (!$in{tabdelimited}) { print ""; } print "$matched_orf"; if (!$in{tabdelimited}) { print ""; } print "$sep"; print "$names{$matched_orf}$sep"; foreach $z (1..$in{critnumber}) { $crit = "crit" . $z; print "$$crit{$matched_orf}$sep"; } print "$paths{$matched_orf}$sep"; print "$descs{$matched_orf}$sep"; print "$line"; } } print $endformat; } #Do a gene expression search (open the appropriate data file and read in #all ORFs and their expression values. ORFs matching the criterion have #their value in the hash %matches increased by 1. ORF names are #upper-cased and a hyphen is inserted between the first W or C that is #directly followed by an A or B; the value that passed is stored per ORF #in the hash named by $crit. NOTE(review): the while () loop has an empty #condition and never reads from IN -- confirm against the deployed script.) sub DoExpSearch { $datafile = $in{"expt" . $i}; $moreless = $in{"moreless" . $i}; $induction = $in{"induction" . $i}; open(IN, "<$dir/minerdata/$datafile"); while () { chop; /^(.*)\t(.*)/; $orf = $1; $value = $2; $orf =~ tr/a-z/A-Z/; $orf =~ s/([WC])([AB])/$1-$2/; if ($value && (!$$crit{$orf})) { if ($moreless =~ /more/) { if ($value > $induction) { $matches{$orf}++; $$crit{$orf} = $value; } } else { if ($value < $induction) { $matches{$orf}++; $$crit{$orf} = $value; } } } } } #Do a MaxH search (open the MaxH data file and read in all ORFs and their #MaxH values. 
# ...ORFs matching the criterion have their value in the hash %matches
# increased by 1.)
#
# DoMaxhSearch takes no arguments and returns nothing useful.  Like the
# other Do*Search routines it is driven by state the caller sets up:
#   $i    - number of the criterion being evaluated (selects the form fields)
#   $crit - NAME of the per-criterion hash (used as a symbolic reference)
#           that records the value with which each ORF passed
#   %in   - parsed form fields; "moreless$i" and "maxh$i" are read
#   $dir  - data directory; $dir/minerdata/orfs.maxh is read and is expected
#           to hold one "ORF<TAB>value" record per line
sub DoMaxhSearch {
    no strict 'refs';    # $$crit{...} deliberately goes through a hash name

    my $moreless  = $in{"moreless" . $i};
    my $threshold = $in{"maxh" . $i};
    my $datafile  = "$dir/minerdata/orfs.maxh";

    # Best effort: a missing data file yields no matches, but is now logged.
    open(my $fh, '<', $datafile) or do {
        warn "DoMaxhSearch: cannot open $datafile: $!\n";
        return;
    };

    while (my $record = <$fh>) {
        chomp $record;    # chomp, not chop: keeps data on an unterminated last line

        # Skip malformed lines instead of reusing captures from an earlier line.
        next unless $record =~ /^(.*)\t(.*)/;
        my ($orf, $value) = (uc($1), $2);

        next unless $value;      # empty or zero values are never scored
        next if $$crit{$orf};    # this ORF already passed the criterion

        my $passes = ($moreless =~ /more/) ? ($value > $threshold)
                                           : ($value < $threshold);
        if ($passes) {
            $matches{$orf}++;
            $$crit{$orf} = $value;
        }
    }
    close($fh);
    return;
}

#Do a protein motif search (open the protein sequence data file and read
#in all ORFs and their predicted protein sequences. ORFs matching the
#criterion have their value in the hash %matches increased by 1.)
#
# Reads "motif$i" from %in, upper-cases it and removes every character
# outside the pattern whitelist below; what is left is used as a regular
# expression against each sequence in $dir/minerdata/orfs.pep.
sub DoMotifSearch {
    my $motif = uc($in{"motif" . $i});
    $motif =~ s/[^A-Z0-9\,\.\*\-\{\}\[\]\^\+\?]//g;

    ScanSequencesForPattern("orfs.pep", $motif);
    return;
}

#Do a promoter motif search (open the promoter sequence data file and read
#in all ORFs and their predicted promoter sequences. ORFs matching the
#criterion have their value in the hash %matches increased by 1.)
#
# Reads "promoter$i" from %in, upper-cases it, removes every character
# outside the pattern whitelist, and expands the nucleotide ambiguity
# letters N, Y, R, M, K, S and W into the matching regex fragment before
# scanning $dir/minerdata/orfs.prom.
sub DoPromoterSearch {
    my %expansion_of = (
        N => '.',
        Y => '[CT]',
        R => '[AG]',
        M => '[AC]',
        K => '[GT]',
        S => '[GC]',
        W => '[AT]',
    );

    my $promoter = uc($in{"promoter" . $i});
    $promoter =~ s/[^0-9A-Z\,\.\*\-\{\}\[\]\^\+\?]//g;

    # One pass is equivalent to the seven sequential substitutions it
    # replaces: no expansion contains a letter that is itself expanded.
    $promoter =~ s/([NYRMKSW])/$expansion_of{$1}/g;

    ScanSequencesForPattern("orfs.prom", $promoter);
    return;
}

# Shared scan loop for the motif and promoter searches.
#   $datafile - file name under $dir/minerdata with "ORF<TAB>sequence" lines
#   $pattern  - already-sanitised regular expression text
# For every ORF whose sequence matches, %matches is incremented and the
# matched substring is stored in the hash named by $crit.
# A pattern that does not compile is logged and treated as matching nothing,
# rather than aborting the whole request part-way through the output.
# NOTE(review): the pattern is still user-controlled regex syntax, so a
# pathological pattern can be slow; nothing here limits match time.
sub ScanSequencesForPattern {
    my ($datafile, $pattern) = @_;
    no strict 'refs';    # $$crit{...} deliberately goes through a hash name

    my $regex = eval { qr/($pattern)/ };
    if (!defined $regex) {
        warn "ScanSequencesForPattern: invalid pattern '$pattern': $@";
        return;
    }

    my $path = "$dir/minerdata/$datafile";
    open(my $fh, '<', $path) or do {
        warn "ScanSequencesForPattern: cannot open $path: $!\n";
        return;
    };

    while (my $record = <$fh>) {
        chomp $record;
        next unless $record =~ /^(.*)\t(.*)/;
        my ($orf, $sequence) = (uc($1), $2);

        next unless $sequence;
        next if $$crit{$orf};

        if ($sequence =~ $regex) {
            $matches{$orf}++;
            $$crit{$orf} = $1;    # the substring that matched
        }
    }
    close($fh);
    return;
}

#Do an isoelectric pt search (open the iso pt data file and read in
#all ORFs and their products' predicted isoelectric points. ORFs matching
#the criterion have their value in the hash %matches increased by 1.)
# DoIsoSearch: score ORFs by predicted isoelectric point.  Takes no
# arguments; reads "iso$i" and "moreless$i" from %in and the records in
# $dir/minerdata/orfs.iso.  $i and $crit are set by the caller.
sub DoIsoSearch {
    ScoreOrfsByThreshold("orfs.iso", $in{"iso" . $i}, $in{"moreless" . $i});
    return;
}

#Do a molecular weight search (open the mol wt data file and read in
#all ORFs and their products' predicted masses. ORFs matching the
#criterion have their value in the hash %matches increased by 1.)
sub DoMassSearch {
    ScoreOrfsByThreshold("orfs.mw", $in{"mass" . $i}, $in{"moreless" . $i});
    return;
}

# Shared loop for the numeric threshold searches.
#   $datafile  - file name under $dir/minerdata with "ORF<TAB>value" lines
#   $threshold - number the value is compared against
#   $moreless  - if it contains "more" an ORF passes when value > threshold,
#                otherwise when value < threshold
# Passing ORFs get %matches incremented and their value stored in the hash
# named by $crit.  Empty or zero values are never scored.
sub ScoreOrfsByThreshold {
    my ($datafile, $threshold, $moreless) = @_;
    no strict 'refs';    # $$crit{...} deliberately goes through a hash name

    my $path = "$dir/minerdata/$datafile";
    # Best effort: a missing data file yields no matches, but is now logged.
    open(my $fh, '<', $path) or do {
        warn "ScoreOrfsByThreshold: cannot open $path: $!\n";
        return;
    };

    while (my $record = <$fh>) {
        chomp $record;    # chomp, not chop: keeps data on an unterminated last line

        # Skip malformed lines instead of reusing captures from an earlier line.
        next unless $record =~ /^(.*)\t(.*)/;
        my ($orf, $value) = (uc($1), $2);

        next unless $value;
        next if $$crit{$orf};

        my $passes = ($moreless =~ /more/) ? ($value > $threshold)
                                           : ($value < $threshold);
        if ($passes) {
            $matches{$orf}++;
            $$crit{$orf} = $value;
        }
    }
    close($fh);
    return;
}

#Do a gene duplicates search (open the duplicates data file and read in
#all ORFs and what duplicate block, if any, they are in. ORFs matching
#the criterion have their value in the hash %matches increased by 1.)
#
# "dups$i" in %in selects the mode: "dup" keeps ORFs that have a block,
# "not_dup" keeps ORFs that have none; any other value keeps nothing.
sub DoDupsSearch {
    no strict 'refs';    # $$crit{...} deliberately goes through a hash name

    my $want_dups = $in{"dups" . $i};
    my $path      = "$dir/minerdata/orfs.dups";

    open(my $fh, '<', $path) or do {
        warn "DoDupsSearch: cannot open $path: $!\n";
        return;
    };

    while (my $record = <$fh>) {
        chomp $record;
        next unless $record =~ /^(.*)\t(.*)/;
        my ($orf, $dup_block) = (uc($1), $2);

        next if $$crit{$orf};

        my $wanted = ($want_dups eq "dup"     &&  $dup_block)
                  || ($want_dups eq "not_dup" && !$dup_block);
        if ($wanted) {
            $matches{$orf}++;
            $$crit{$orf} = $dup_block;
        }
    }
    close($fh);
    return;
}

# Print the criterion fields for one expression data class and collect the
# experiment suffixes available for it in @expts (first element is always
# the empty string).  Reads $dataclass, $longcrit and $x set by the caller.
# The directory is read directly instead of piping through "ls", so no
# shell ever sees $dataclass.
# NOTE(review): nothing visible in this file calls PrintExp, and the empty
# print statements look like places where markup has been lost -- confirm
# against the deployed script before relying on its output.
sub PrintExp {
    @expts = "";

    my $data_dir = "$dir/minerdata";
    if (opendir(my $dh, $data_dir)) {
        foreach my $entry (sort readdir($dh)) {
            next if $entry =~ /^\./;    # never offer hidden entries, "." or ".."
            push(@expts, $1) if $entry =~ /^\Q$dataclass\E\.(.*)/;
        }
        closedir($dh);
    }
    else {
        warn "PrintExp: cannot read $data_dir: $!\n";
    }

    print "Criterion $x: Expression in $longcrit dataset\n";
    print "";
    print "Genes which are induced ";
    print "";
    print "";
    print "-fold in the experiment ";
    print "";
    return;
}

#Get gene names for all ORFs, store them in the %names hash
sub GetNames {
    LoadOrfAnnotations("orfs.names", \%names);
    return;
}

#Get descriptions of what pathway each gene functions in, store them in
#the %paths hash
sub GetPaths {
    LoadOrfAnnotations("orfs.paths", \%paths);
    return;
}

#Get descriptions of the function of each gene, store them in the %descs
#hash
sub GetDescs {
    LoadOrfAnnotations("orfs.descs", \%descs);
    return;
}

# Shared loader for the three annotation files.
#   $datafile - file name directly under $dir with "ORF<TAB>text" lines
#   $table    - reference to the hash that receives ORF => text
# ORF names are stored exactly as they appear in the file.
sub LoadOrfAnnotations {
    my ($datafile, $table) = @_;

    my $path = "$dir/$datafile";
    open(my $fh, '<', $path) or do {
        warn "LoadOrfAnnotations: cannot open $path: $!\n";
        return;
    };

    while (my $record = <$fh>) {
        chomp $record;
        next unless $record =~ /^(.*)\t(.*)/;
        $table->{$1} = $2;
    }
    close($fh);
    return;
}