#!/usr/bin/perl use List::Util qw[min max]; #perl SFDRv14.pl -assoc Fusionp.txt -linkage FUSION_LOD.txt -SFDR -WFDR -out out.txt -top 10 -map RutgersMap.txt #Modified 9/16/2009 for v1.4 : add option for missing data symbol, checking pvalueis between 0 and 1 ####################Main step########################### #see subroutine main for the main procedures my %ioption; my %Zall; $version=1.4; &ReadCommandLine(); &OpenOutput(); my ($chref,$posref,$pallref,$snparrayref,$strataref, $Lscoreref,$locref,$BPmapref,$CMmapref)=&ReadInput(); &main($chref,$posref,$pallref,$snparrayref,$strataref, $Lscoreref,$locref,$BPmapref,$CMmapref); ########################################################### ################### SUBROUTINES############################## ########################################################### # Read command line argument and determine the options############################ sub ReadCommandLine { #default values and initiation $ioption{"-assoc"}=""; $ioption{"-linkage"}=""; $ioption{"-linkpeak"}=""; $ioption{"-map"}=""; # or a file $ioption{"-SFDR"}=""; $ioption{"-WFDR"}=""; $ioption{"-C"}=0.5; $ioption{"-B"}=1; $ioption{"-out"}=""; $ioption{"-top"}=10**8; $ioption{"-missing"}="."; #What about using peak lod information #header or no-header print STDOUT "\n>>>>SFDR v$version is beginning...<<<<\n"; #read options my $i; for ( $i = 0; $i <= $#ARGV; $i++ ) { if($ARGV[$i] eq "-assoc"){ $ioption{"-assoc"}=$ARGV[$i+1]; my $firstletter=substr $ARGV[$i+1],0,1; if ($ioption{"-assoc"} eq "" or $firstletter eq "-") {die "ERROR: No association file specified!\n";} } if($ARGV[$i] eq "-linkage"){ $ioption{"-linkage"}=$ARGV[$i+1]; my $firstletter=substr $ARGV[$i+1],0,1; if ($ioption{"-linkage"} eq "" or $firstletter eq "-") {die "ERROR: No linkage file specified!\n";} } if($ARGV[$i] eq "-linkpeak"){ $ioption{"-linkpeak"}=$ARGV[$i+1]; my $firstletter=substr $ARGV[$i+1],0,1; if ($ioption{"-linkpeak"} eq "" or $firstletter eq "-") {die "ERROR: No linkage peak file specified!\n";} } if($ARGV[$i] eq "-map"){ $ioption{"-map"}=$ARGV[$i+1]; my $firstletter=substr $ARGV[$i+1],0,1; if ($ioption{"-map"} eq "" or $firstletter eq "-") {$ioption{"-map"}="";} } if($ARGV[$i] eq "-SFDR"){ $ioption{"-SFDR"}="y"; } if($ARGV[$i] eq "-WFDR"){ $ioption{"-WFDR"}="y"; } if($ARGV[$i] eq "-C"){ $ioption{"-C"}=$ARGV[$i+1]; } if($ARGV[$i] eq "-B"){ $ioption{"-B"}=$ARGV[$i+1]; } if($ARGV[$i] eq "-out"){ $ioption{"-out"}=$ARGV[$i+1]; my $firstletter=substr $ARGV[$i+1],0,1; if ($firstletter eq "-") {die "ERROR: No output file specified!\n";} } if($ARGV[$i] eq "-top"){ $ioption{"-top"}=$ARGV[$i+1]; } if($ARGV[$i] eq "-missing"){ $ioption{"-missing"}=$ARGV[$i+1]; } } #Some error messages if(($ioption{"-WFDR"} eq "y" ) & $ioption{"-linkage"} eq ""){die "ERROR: No linkage file specified for WFDR!\n";} if($ioption{"-linkage"} ne "" & $ioption{"-map"} eq ""){die "ERROR: No map file specified for linkage data!\n";} if($ioption{"-linkpeak"} ne "" & $ioption{"-map"} eq ""){die "ERROR: No map file specified for linkage peak data!\n";} if($ioption{"-linkage"} ne "" & $ioption{"-linkpeak"} ne ""){die "ERROR: Cannot choose both linkage option and linkpeak option!\n";} if($ioption{"-WFDR"} eq "y" & $ioption{"-linkpeak"} ne ""){die "ERROR: WFDR cannot be done with only linkage peak info!\n";} #assign case number #1: do FDR only. no linkage data given if($ioption{"-linkage"} eq "" & $ioption{"-SFDR"} eq ""){ #1: do FDR only. no linkage data given $Case=1; print STDOUT "NOTE: No linkage data or strata data are given. Only FDR q-values will be computed.\n"; } #2: do FDR, SFDR and/or WFDR with linkage given if($ioption{"-linkage"} ne ""){ #2: do SFDR and/or WFDR with linkage given $Case=2; print STDOUT "NOTE: Linkage data are given. FDR"; if ($ioption{"-SFDR"} eq "y"){print STDOUT ",SFDR";} if ($ioption{"-WFDR"} eq "y"){print STDOUT ",WFDR";} print STDOUT " q-values will be computed.\n"; if ($ioption{"-SFDR"} eq "y" | $ioption{"-SFDR"} eq "y"){ print STDOUT "NOTE: Map file $ioption{-map} will be used for linkage data.\n"; } } #3: do SFDR with given strata if($ioption{"-linkage"} eq "" & $ioption{"-SFDR"} eq "y" ) { $Case=3; print STDOUT "NOTE: No linkage data are given. Group is assigned in the data. FDR,SFDR q-values will be computed.\n"; } #4: estimate SFDR strata from peak LOD info if($ioption{"-linkpeak"} ne ""){ $Case=4; print STDOUT "NOTE: Linkage peak data are given. FDR"; if ($ioption{"-SFDR"} eq "y"){print STDOUT ",SFDR";} print STDOUT " q-values will be computed.\n"; if ($ioption{"-SFDR"} eq "y"){ print STDOUT "NOTE: Map file $ioption{-map} will be used for linkage peak data.\n"; } } print STDOUT "NOTE: Missing P-values ($ioption{-missing} or blank) will be removed from the analysis.\n"; } #open output##################################################### sub OpenOutput{ my ($outref, $logref); #output file and log file if($ioption{"-out"} eq ""){ my @array=split(/\./,$ioption{"-assoc"}); for(my $i=0;$i<$#array;$i++){ if($i==0){$outref=$array[0];} else{$outref=$outref.".".$array[$i];} } $logref=$outref.".log"; $outref=$outref.".out"; print STDOUT "NOTE: No output file specified. $outref will be created.\n"; } else{ $outref=$ioption{"-out"}; my @array=split(/\./,$ioption{"-out"}); for(my $i=0;$i<$#array;$i++){ if($i==0){$logref=$array[0];} else{$logref=$logref.".".$array[$i];} } $logref=$logref.".log"; print STDOUT "NOTE: Output file $outref will be created.\n"; } print STDOUT "NOTE: Log file $logref will be created.\n"; open(out, ">$outref"); open(out2, ">$logref"); select out2; print ">>>>>>>SFDR v$version <<<<<<<<\n"; print "\nAssociation Data File: $ioption{-assoc}\n"; #1: do FDR only. no linkage data given if($Case==1){ #1: do FDR only. no linkage data given print "\nNOTE: No linkage data or strata data are given.\n"; print "NOTE: FDR will be done.\n"; } #2: do FDR, SFDR and/or WFDR with linkage given if($Case==2){ #2: do SFDR and/or WFDR with linkage given print "Linkage Data File: $ioption{-linkage}\n"; print "\nNOTE: FDR will be done\n"; if ($ioption{"-SFDR"} eq "y"){print "NOTE: SFDR will be done.\n";} if ($ioption{"-WFDR"} eq "y"){print "NOTE: WFDR will be done.\n";} if ($ioption{"-SFDR"} eq "y" | $ioption{"-WFDR"} eq "y"){ print "NOTE: Map file $ioption{-map} will be used for linkage data.\n"; } } #3: do SFDR with given strata if($Case==3) { print "\nNOTE: No linkage data are given. Strata are assigned in the data.\n"; print "NOTE: FDR will be done.\n"; print "NOTE: SFDR will be done.\n"; } #4: estimate SFDR strata from peak LOD info if($Case==4) { print "NOTE: Linkage peak data are given. FDR"; if ($ioption{"-SFDR"} eq "y"){print ",SFDR";} print " q-values will be computed.\n"; if ($ioption{"-SFDR"} eq "y"){ print "NOTE: Map file $ioption{-map} will be used for linkage peak data.\n"; } } #########output options############ if($ioption{"-top"}<1000000000) { print STDOUT "NOTE: Only SNPs with rank <= $ioption{-top} will be included in the output.\n"; print "NOTE: Only SNPs with rank <= $ioption{-top} will be included in the output.\n"; } } #read input data files : association input, linkage input, linkage map file####################### sub ReadInput{ my %Lscore;#linkage score hash my %loc;#likage location hash array my %BPmap;#bp map hash array my %CMmap; #cM map hash array my %Markers; #map marker names hash array ??? my %ch; #chr assignment for SNPs my %pos;#bp position assignment for SNPs by chr (array) my %pall; #association p-values my %snparray;#snp array by chr my %posbychr; #####read ASSOCIATION DATA################################################### open(ain,"<$ioption{-assoc}") or die "ERROR: cannot find $ioption{-assoc}!\n"; my $line=; my %varcol=findcolumn($line); my $sc=$varcol{"SNP"};#column for snp variable if (not exists $varcol{"SNP"}){die "ERROR: No variable named \"SNP\" in association data\n";} if (not exists $varcol{"CHR"}){die "ERROR: No variable named \"CHR\" in association data\n";} if (not exists $varcol{"POS"}){die "ERROR: No variable named \"SNP\" in association data\n";} if (not exists $varcol{"P"}){die "ERROR: No variable named \"SNP\" in association data\n";} my $snpind; my (%sind); my $multi=1; while(){ chomp; my @array=split(/[\t,\s]/,$_); #Missing data if($array[$varcol{"P"}] eq $ioption{"-missing"} or length($array[$varcol{"P"}]) ==0) {next;} if($array[$varcol{"P"}] > 1 or $array[$varcol{"P"}] <0 ) { die "ERROR: P-value ($array[$varcol{SNP}]) is not bewteen 0 and 1\n";} my $chr=$array[$varcol{"CHR"}]; $sind{$chr}++; $ch{$array[$sc]}=$chr; $posbychr{$chr}{$array[$sc]}=$array[$varcol{"POS"}]; $pall{$array[$sc]}=$array[$varcol{"P"}]; if ($Case ==3){ if(not exists $varcol{"GROUP"}){die "ERROR: You cannot do SFDR without group assignment or linkage data! (No variable named \"GROUP\" in the association data)\n";} $strata{$array[$sc]}=$array[$varcol{"GROUP"}]; } } close(ain); foreach my $k (sort(keys %posbychr)){ @{$snparray{$k}}= sort{$posbychr{$k}{$a} <=> $posbychr{$k}{$b}} keys %{$posbychr{$k}}; #print STDOUT "$k\t$#{$snparray{$k}}\n"; for(my $j=0;$j<=$#{$snparray{$k}};$j++){ $pos{$k}[$j]=$posbychr{$k}{$snparray{$k}[$j]}; } } my %posbychr; print STDOUT "**Association data have been read from $ioption{-assoc}\n"; #read LINKAGE data or Linkage peak data and map if( ($Case==2 or $Case==4) &($ioption{"-SFDR"} eq "y" | $ioption{"-WFDR"} eq "y")){ #####read LINKAGE DATA (chr cM LOD)######################################## if($Case==2){open(lin,"<$ioption{-linkage}") or die "ERROR: cannot find $ioption{-linkage}!\n";} if($Case==4){open(lin,"<$ioption{-linkpeak}") or die "ERROR: cannot find $ioption{-linkpeak}!\n";} my $line=; my %varcol=findcolumn($line); if (not exists $varcol{"CHR"}){die "ERROR: No variable named \"CHR\" in linkage data\n";} if (not exists $varcol{"CM"}){die "ERROR: No variable named \"cM\" in linkage data\n";} if ((not exists $varcol{"LOD"}) & (not exists $varcol{"NPL"})){die "ERROR: No variable named \"LOD\" or \"NPL\" in linkage data\n";} if((exists $varcol{"LOD"}) & (exists $varcol{"NPL"})){die "ERROR: You have both \"LOD\" and \"NPL\" in the data.\n";} my $Zcol; if (exists $varcol{"LOD"}){$Zcol=$varcol{"LOD"};} if (exists $varcol{"NPL"}){$Zcol=$varcol{"NPL"};} while(){ chomp; my @array=split(/[\t,\s]/,$_); $Lscore{$array[$varcol{"CHR"}]}{$array[$varcol{"CM"}]}=$array[$Zcol]; #0-chr, 1-cM, 2-Linkage } close(lin); if($Case==2){print STDOUT "**Linkage data have been read from $ioption{-linkage}\n";} if($Case==4){print STDOUT "**Linkage peak data have been read from $ioption{-linkpeak}\n";} #array of linkage points sorted by cM order foreach my $k (sort(keys %Lscore)){ @{$loc{$k}}=sort { $a <=> $b } keys %{$Lscore{$k}}; } #####read MAP######################################################## #Default map: Rutgers map build 36 #map is in RutgersMap.txt. It is in Kosambi map. The conversion to Haldane cM is performed if specified. #chr Marker cM bp (where Markers are either snp or ms) $mapref=$ioption{"-map"}; open(mp,"<$mapref") or die "ERROR: cannot find the $mapref!\n"; my $line=; my %varcol=findcolumn($line); if (not exists $varcol{"CHR"}){die "ERROR: No variable named \"CHR\" in linkage data\n";} if (not exists $varcol{"CM"}){die "ERROR: No variable named \"cM\" in linkage data\n";} if (not exists $varcol{"BP"}){die "ERROR: No variable named \"bp\" in linkage data\n";} #if (not exists $varcol{"MARKER"}){die "ERROR: No variable named \"MARKER\" in linkage data\n";} my %match; while(){ chomp; my @array=split; $ind{$array[0]}++; #$Markers{$array[0]}[$ind{$array[0]}-1]=$array[1]; $match{$array[$varcol{"CHR"}]}{$array[$varcol{"BP"}]}=$array[$varcol{"CM"}]; } close(mp); foreach my $k (sort(keys %match)){ @{$BPmap{$k}}=sort { $a <=> $b } keys %{$match{$k}}; for(my $l=0; $l<=$#{$BPmap{$k}};$l++){ $CMmap{$k}[$l]=$match{$k}{$BPmap{$k}[$l]}; } } print STDOUT "**Map data have been read\n"; } return (\%ch,\%pos,\%pall, \%snparray, \%strata,\%Lscore, \%loc, \%BPmap, \%CMmap); } #main procedure########################################################## sub main{ my ($chref,$posref,$pallref,$snparrayref,$strataref,$Lscoreref,$locref,$BPmapref,$CMmapref)=@_; my (@Z, @psub, @pcount); my (%W, %strata, %Zall); my ($SumW, $qF, $qW, $qS, $rF, $rW, $rS); select out2; #obtain group and weights (before standardization) from linkage scores if($Case<=2 or $Case==4){ my %stind; for(my $i=1;$i<=22;$i++){ if($ioption{"-SFDR"} eq "y" | $ioption{"-WFDR"} eq "y"){ if ($Case==2){@Z=&findZ(${$posref}{$i}, $i,${$BPmapref}{$i},${$CMmapref}{$i}, ${$Lscoreref}{$i}, ${$locref}{$i});} if ($Case==4){@Z=&findgroup(${$posref}{$i}, $i,${$BPmapref}{$i},${$CMmapref}{$i}, ${$Lscoreref}{$i}, ${$locref}{$i});} } #print STDOUT "Zsize:$i\t$#Z\n"; for(my $j=0;$j<=$#{${$snparrayref}{$i}};$j++){ my $snp=$snparrayref->{$i}[$j]; $Zall{$snp}=$Z[$j]; if(${$pallref}{$snp}>0.5){$pcount[0]++;} if($ioption{"-SFDR"} eq "y"){ if($Z[$j]>=$ioption{"-C"}){ $strata{$snp}=1; $psub[1]{$snp}=${$pallref}{$snp}; if($psub[1]{$snp}>0.5){$pcount[1]++;} #print STDOUT "$snparrayref->{$i}[$j]\t$Z[$j]\t$psub1{$snparrayref->{$i}[$j]}\n"; } elsif($Z[$j]<$ioption{"-C"}){ $strata{$snp}=2; $psub[2]{$snp}=${$pallref}{$snp}; if($psub[2]{$snp}>0.5){$pcount[2]++;} #print STDOUT "$snparrayref->{$i}[$j]\t$Z[$j]\t$psub1{$snparrayref->{$i}[$j]}\n"; } } if($ioption{"-WFDR"} eq "y"){ $W{$snp}=exp($ioption{"-B"}*$Z[$j]); $SumW+=$W{$snp}; #print STDOUT "$snparrayref->{$i}[$j]\t$W{$snparrayref->{$i}[$j]}\n"; } } } if($ioption{"-SFDR"} eq "y" | $ioption{"-WFDR"} eq "y"){ print STDOUT "**Interpolation of LOD scores done\n";} ($qF, $rF)=&FDR($pallref,$pcount[0]); print STDOUT "**FDR done\n"; if($ioption{"-WFDR"} eq "y"){ ($qW, $rW,$WW)=&WFDR($pallref,\%W,$SumW); print STDOUT "**WFDR done\n"; } if($ioption{"-SFDR"} eq "y"){ $stind{1}=1;$stind{2}=2; ($qS, $rS)=&SFDR($pallref,\@psub, \@pcount, \%stind); print STDOUT "**SFDR done\n"; } } if ($Case==3 ){ %strata=%{$strataref}; #assign strata index as 1,2,3... my %stind; my $ind; foreach my $k (keys %strata){ if (not exists $stind{$strata{$k}}){ $ind++; $stind{$strata{$k}}=$ind; } } for(my $i=1;$i<=22;$i++){ for($j=0;$j<=$#{${$snparrayref}{$i}};$j++){ my $snp=$snparrayref->{$i}[$j]; my $ind=$stind{$strata{$snp}}; $psub[$ind]{$snp}=${$pallref}{$snp}; if(${$pallref}{$snp}>0.5){$pcount[0]++;} if($psub[$ind]{$snp}>0.5){$pcount[$ind]++;} #print STDOUT "$stind\t$snp\t$psub[$stind]{$snp}\n"; } } ($qF, $rF)=&FDR($pallref,$pcount[0]); print STDOUT "**FDR done\n"; ($qS, $rS)=&SFDR($pallref,\@psub, \@pcount,\%stind); print STDOUT "**SFDR done\n"; } #print STDOUT "here${$qS}{rs2824245}\t${$rS}{rs2824245}\n"; #print STDOUT "size=$#{${$snparrayref}{1}}\n"; ################output################################################# select out; #header print "chr\tsnp\tpos\tp\tq_FDR\tr_FDR"; if($ioption{"-SFDR"} eq "y"){print "\tq_SFDR\tr_SFDR\tgroup";} if($ioption{"-WFDR"} eq "y"){print "\tq_WFDR\tr_WFDR\tweight";} if(($Case==2 | $Case==4) &( $ioption{"-SFDR"} eq "y" | $ioption{"-WFDR"} eq "y")){ print "\tZ"; } print "\n"; for(my $i=1;$i<=22;$i++){ for($j=0;$j<=$#{${$snparrayref}{$i}};$j++){ my $rankin; my $snp=$snparrayref->{$i}[$j]; my $top=0; if(${$rF}{$snp}<=$ioption{"-top"}){$top=1;} if($ioption{"-SFDR"} eq "y" & ${$rS}{$snp}<=$ioption{"-top"}) {$top=1;} if($ioption{"-WFDR"} eq "y" & ${$rW}{$snp}<=$ioption{"-top"}){$top=1;} if($top==0){next;} #default FDR print "${$chref}{$snp}\t$snp\t$posref->{$i}[$j]\t${$pallref}{$snp}"; printf "\t%5.4f", ${$qF}{$snp}; print "\t${$rF}{$snp}"; if($ioption{"-SFDR"} eq "y"){ printf "\t%5.4f", ${$qS}{$snp}; print "\t${$rS}{$snp}\t$strata{$snp}"; } if($ioption{"-WFDR"} eq "y"){ printf "\t%5.4f", ${$qW}{$snp}; print "\t${$rW}{$snp}"; printf "\t%5.4f", ${$WW}{$snp}; } if(($Case==2 | $Case==4) & ($ioption{"-SFDR"} eq "y" | $ioption{"-WFDR"} eq "y")){ printf "\t%.4f",$Zall{$snp}; } print "\n"; } } } #FDR################################################################## sub FDR{ my ($pallref,$pcount)=@_; my (%rall); my $l=keys %{$pallref}; my $phi0=phi0($pcount,$l); my ($qref,$SNPsortref) =&qvalue($pallref, $phi0); for (my $k=0;$k<=$#{$SNPsortref};$k++){ $rall{${$SNPsortref}[$k]}=$k+1; } select out2; print "\n>>FDR summary\n"; print "# of SNPs : $l\n"; return ($qref,\%rall); } #SFDR summary #strata: high, med, low #threshold of LOD: C #size of strata: num, num, num #SFDR################################################################## sub SFDR{ my ($pallref,$psubref, $pcountref, $stindref)=@_; my (%qSall,%rSall, %stsize); my @sthash; foreach my $k (keys %{$stindref}){ $sthash[${$stindref}{$k}]=$k; } for(my $k=1;$k<=$#{$psubref};$k++){ my $lsub=keys %{${$psubref}[$k]}; my $phi0=phi0(${$pcountref}[$k],$lsub); my ($qref,$SNPsortref) = &qvalue(${$psubref}[$k],$phi0); %qSall=(%qSall, %{$qref}); $stsize{$sthash[$k]}=$lsub; #$temp=keys %{$qref}; #print STDOUT "$k: $temp, phi0=$phi0\n"; } #$temp=keys %qSall; #print STDOUT "$k: $temp\n"; my @SNPsort=sort {$qSall{$a}<=>$qSall{$b}} keys %qSall; #print STDOUT "$#SNPsort\n"; %rSall=&rank(\@SNPsort,\%qSall,$pallref); #$temp=keys %rSall; #print STDOUT "rank: $temp\n"; select out2; print "\n>>SFDR summary\n"; print "Strata: "; for (my $k=1;$k<$#sthash;$k++){ print "$sthash[$k], "; } print "$sthash[$#sthash]\n"; print "Threshold C: $ioption{-C}\n"; print "Size of strata: "; for (my $k=1;$k<$#sthash;$k++){ print "$stsize{$sthash[$k]}, "; } print "$stsize{$sthash[$#sthash]}\n"; return (\%qSall, \%rSall); } #WFDR#################################################################### sub WFDR{ my ($pallref, $Wref, $SumW)=@_; my (%Wp); my ($wpcount,$phi0, $maxW,$minW); my $l=keys %{$Wref}; #print STDOUT "$l, ${$Wref}{rs2824245}\n"; $maxW=-1;$minW=10; foreach my $k (keys %{$Wref}){ ${$Wref}{$k}=${$Wref}{$k}/($SumW/$l); $Wp{$k}=${$pallref}{$k}/${$Wref}{$k}; #if ($Wp{$k}>1){$Wp{$k}= 1;} if(${$Wref}{$k}>$maxW){$maxW=${$Wref}{$k};} if(${$Wref}{$k}<$minW){$minW=${$Wref}{$k};} if ($Wp{$k}>0.5){$wpcount++;} } $phi0=&phi0($wpcount,$l); my ($qref,$SNPsortref) = &qvalue(\%Wp,$phi0); my %ranked=&rank($SNPsortref,$qref,\%Wp); my $averW=($SumW/$l); print "\n>>WFDR summary\n"; print "Weighting constant B: $ioption{-B}\n"; print "Average Weight: $averW\n"; print "Maximum Weight: $maxW\n"; print "Mininum Weight: $minW\n"; return($qref,\%ranked,$Wref); } #find Z scores for SNPs in each chromosome#################################################### sub findZ{ my ($posref, $chr, $BPmapref, $CMmapref, $Lref, $locref)=@_; my @posarray=@{$posref}; #by chr, association pos array my @BParray=@{$BPmapref}; #by chr, map BP array my @CMarray=@{$CMmapref}; #by chr, map CMarray my %L=%{$Lref}; #by chr my @locarray=@{$locref};#by chr my (@CMposarray,@Zarray); #print STDOUT "possize=$#posarray\tBPsize=$#BParray\n"; #find i and i+1 in the BParray my ($pnow,$bnow,$lnow, $int);#pnow, bnow progress in array comparison #int: indexfound; $pnow=0;$bnow=0;$lnow=0; while($pnow<=$#posarray){ #Step1: find location of the SNP in the BPmap while($posarray[$pnow]>$BParray[$bnow] & $bnow<$#BParray){ $bnow++; } $int=$bnow; #Step 2 :Interpolated CM my ($unit, $CMpos, $CM1, $CM2, $BP1, $BP2); if($int==0){$CMpos=$CMarray[$int];} elsif($int<=$#BParray & $posarray[$pnow]<=$BParray[$#BParray]){ $CM1=$CMarray[$int-1]; $CM2=$CMarray[$int]; $BP1=$BParray[$int-1]; $BP2=$BParray[$int]; $unit=($CM2-$CM1)/($BP2-$BP1); $CMpos=$CM1+$unit*($posarray[$pnow]-$BP1); } elsif($posarray[$pnow]>$BParray[$#BParray]){ $CMpos=$CMarray[$int]; } #Step 3: Find the location of CMpos in the @loc (linkage results) while($CMpos>$locarray[$lnow] & $lnow<$#locarray){ $lnow++; } my $locint=$lnow; #Step 4: Interpolate lod score and p-value my ($lodscore, $lod1,$lod2,$loc1,$loc2,$unit); if($locint==0){$lodscore=$L{$locarray[$locint]};} elsif ($locint<=$#locarray & $CMpos<=$locarray[$#locarray] ){ $lod1=$L{$locarray[$locint-1]}; $lod2=$L{$locarray[$locint]}; $loc1=$locarray[$locint-1]; $loc2=$locarray[$locint]; $unit=($lod2-$lod1)/($loc2-$loc1); $lodscore=$lod1+$unit*($CMpos-$loc1); #print STDOUT "h1:"; } elsif($CMpos> $locarray[$#locarray] ){ $lodscore=$L{$locarray[$locint]};#print STDOUT "h2:"; } $Zarray[$pnow]=$lodscore; #select out2; #print "$pnow $bnow: p-$posarray[$pnow] b-$BParray[$bnow] $CMpos $locint $lodscore\n"; #if($pnow>100){last;} $pnow++; } return @Zarray; } #find linkage group for SNPs in each chromosome#################################################### sub findgroup{ my ($posref, $chr, $BPmapref, $CMmapref, $Lref, $locref)=@_; my (@d,@lcM,@hcM); my @posarray=@{$posref}; #by chr, association pos array my @BParray=@{$BPmapref}; #by chr, map BP array my @CMarray=@{$CMmapref}; #by chr, map CMarray my %L=%{$Lref}; #by chr : peak LOD score for peak cM my @locarray=@{$locref}; my @Zarray; my ($pnow,$bnow,$lnow, $int);#pnow, bnow progress in array comparison #int: indexfound; $pnow=0;$bnow=0;$lnow=0; for (my $k=0;$k<=$#locarray;$k++){ ($d[$k],$lcM[$k],$hcM[$k])=®ion($L{$locarray[$k]},$locarray[$k],$ioption{"-C"}); #print STDOUT "$chr\t$k\t$d[$k],$lcM[$k],$hcM[$k]\n"; } while($pnow<=$#posarray){ my $lodscore=0; if(length($#locarray)==0){ $Zarray[$pnow]=$lodscore; next; } #Step1: find location of the SNP in the BPmap while($posarray[$pnow]>$BParray[$bnow] & $bnow<$#BParray){ $bnow++; } $int=$bnow; #Step 2 :Interpolated CM my ($unit, $CMpos, $CM1, $CM2, $BP1, $BP2); if($int==0){$CMpos=$CMarray[$int];} elsif($int<=$#BParray & $posarray[$pnow]<=$BParray[$#BParray]){ $CM1=$CMarray[$int-1]; $CM2=$CMarray[$int]; $BP1=$BParray[$int-1]; $BP2=$BParray[$int]; $unit=($CM2-$CM1)/($BP2-$BP1); $CMpos=$CM1+$unit*($posarray[$pnow]-$BP1); } elsif($posarray[$pnow]>$BParray[$#BParray]){ $CMpos=$CMarray[$int]; } #Step3: find the linkage score for high linkage region and low linkage region: low linkage region, it is all 0 for (my $k=0;$k<=$#locarray;$k++){ if($CMpos<=$hcM[$k] & $CMpos>$lcM[$k]){ $lodscore=exp(-0.08* abs($locarray[$k]-$CMpos))*$L{$locarray[$k]}; last; } } select out2; print "$pnow $bnow: p-$posarray[$pnow] b-$BParray[$bnow] $CMpos $locint $lodscore\n"; $Zarray[$pnow]=$lodscore; $pnow++; } return @Zarray; } #compute the phi0; pc-pcount, m-size of group############################################# sub phi0{ my ($pc,$m)=@_; my $p0; $p0=$pc/($m*0.5); $p0=min(1,$p0); return $p0; } #assign qvalues for group of p-values sub qvalue{ my ($pref, $phi0)=@_;; my %p=%{$pref}; my $m=keys %p; my @SNPsort=sort {$p{$a}<=>$p{$b}} keys %p; my %q; for(my $j=0;$j<=$#SNPsort;$j++){ $q{$SNPsort[$j]}=$phi0*$m*$p{$SNPsort[$j]}/($j+1); } for(my $j=$#SNPsort-1;$j>=0;$j--){ $q{$SNPsort[$j]}=min($q{$SNPsort[$j]},$q{$SNPsort[$j+1]},1); } return (\%q,\@SNPsort); } #find column numbers: varcol matches header name with array number################################# sub findcolumn{ my ($line)=@_; my %varcol; my @var=split(/[\t,\s]/,$line); for(my $n=0;$n<=$#var;$n++){ $var[$n]=~tr/a-z/A-Z/; $varcol{$var[$n]}=$n; } return %varcol; } #resort sorted array by resolving ties using pvalue ranks####################################### sub rank{ my ($aref,$qref,$pref)=@_; #aref-SortedSNP array ref, qref-qvalue, pref-pvalue my %qq=%{$qref}; my %pall=%{$pref}; my @aa=@{$aref}; my $start=0; my $end=0; my $qbefore=$qq{$aa[0]}; my %ranked=(); #chunk by chunk (chunk of same q-value) while($start<=$#aa){ my @asub=(); my $jj=0; for(my $j=$start;$j<=$#aa;$j++){ if($qq{$aa[$j]}==$qbefore){ $end=$j; $asub[$jj]=$aa[$j]; $qbefore=$qq{$aa[$j]}; $jj++; #print STDOUT "H1:$start\t$j\t$qbefore\n"; } else{ $qbefore=$qq{$aa[$j]}; #print STDOUT "H1:$start\t$j\t$qbefore\n"; last; } } #re-sort start to end by my %pp=(); @pp{@asub}=@pall{@asub}; my @sortedP=sort {$pp{$a}<=>$pp{$b}} keys %pp; for(my $j=0;$j<=$#sortedP;$j++){ $ranked{$sortedP[$j]}=$start+$j+1; } $start=$end+1; } return %ranked; } #K-kosambi (cM) #H-haldane (cM) #r-rf #r=(exp(4K*100)-1)/2(1+exp(4K*100)) #H=(-1/2)ln(1-2r)/100 #conversion from Kosambi cM to Haldane cM sub KtoH{ my ($KcM)=@_; my $r =0.5* (exp(4*$KcM/100)-1)/(1+exp(4*$KcM/100)); my $HcM= (-0.5) * log(1-2*$r)*100; return $HcM; } ##################estimate the region above threshold using peak LOD score######################### sub region{ my ($peakLOD, $peakCM, $cutLOD)=@_; if ($peakLOD==0){ $peakLOD=1e-6;} my $dCM=25*( log( sqrt($peakLOD)) - log(sqrt( $cutLOD)) ); my $lowCM=$peakCM-$dCM; my $highCM=$peakCM+$dCM; return ($dCM, $lowCM, $highCM); } #The end ################################################################################## #find index by optimazation rule sub findindex{ my ($position,$marrayref)=@_; my @marray=@{$marrayref}; my $indsize=$#marray; my $start=0; my $end=$indsize; my $mid; my $jump=0; my $findit=0; my $find; #print "$position\t$marray[0]\n"; if($position <=$marray[0] and $position>0){ $findit=1; return 0; } #print "$findit\there\n"; do{ $jump++; $mid=int(($start+$end)/2); if($position>=$marray[$mid]){ if($position<$marray[$mid+1]){ $findit=1; $find=$mid+1; } else {$start=$mid;} } if($position<$marray[$mid]){ if($position>=$marray[$mid-1]){ $findit=1; $find=$mid; } else {$end=$mid;} } }while($findit==0); return ($jump,$find); } #compute linkage score at any given bp position sub score2{ my ($position, $chr, $BPmapref,$CMmapref, $Lref, $locref)=@_; my @posarray=@{$BPmapref}; #by chr my @CMarray=@{$CMmapref}; #by chr my %L=%{$Lref}; #by chr my @locarray=@{$locref};#by chr my $int; #print "score:$#posarray\t$#CMarray\n"; #Find the i and i+1 keys out of pos array. $posarray[$#posarray+1]=1000000000; $int=findindex($position,\@posarray); #Interpolate the CM my $unit, $CMpos; if($int==0){ $CMpos=$CMarray[$int]; } elsif($int<$#posarray){ $CM1=$CMarray[$int-1]; $CM2=$CMarray[$int]; $pos1=$posarray[$int-1]; $pos2=$posarray[$int]; $unit=($CM2-$CM1)/($pos2-$pos1); $CMpos=$CM1+$unit*($position-$pos1); } elsif($int==$#posarray){ $CMpos=$CMarray[$int-1]; } #Find loc of CMpos in the linkage results my $locint; $locarray[$#locarray+1]=500; $locint=findindex($CMpos,\@locarray); my ($lodscore, $lod1,$lod2,$loc1,$loc2,$unit); #Interpolate lod score and p-value if($locint==0){ $lodscore=$L{$locarray[$locint]}; } elsif ($locint<$#locarray){ $lod1=$L{$locarray[$locint-1]}; $lod2=$L{$locarray[$locint]}; $loc1=$locarray[$locint-1]; $loc2=$locarray[$locint]; $unit=($lod2-$lod1)/($loc2-$loc1); $lodscore=$lod1+$unit*($CMpos-$loc1); } elsif($locint==$#locarray){ $lodscore=$L{$locarray[$locint-1]}; } #print STDOUT "$chr\t$position\t$CMpos\t$locarray[$locint]\t$lodscore\n"; return ($CMpos, $lodscore); } sub score{ my ($position, $chr, $BPmapref,$CMmapref, $Lref, $locref)=@_; my @posarray=@{$BPmapref}; #by chr my @CMarray=@{$CMmapref}; #by chr my %L=%{$Lref}; #by chr my @locarray=@{$locref};#by chr my $int; #print "score:$#posarray\t$#CMarray\n"; #Find the i and i+1 keys out of pos array. $posarray[$#posarray+1]=1000000000; for (my $j=0;$j<=$#posarray;$j++){ if($j==0){ if ($position<$posarray[$j] && $position>=0) { $int=$j; last; } } else { if($position<$posarray[$j] && $position >= $posarray[$j-1]){ $int=$j; last; } } } #Interpolate the CM my $unit, $CMpos; if($int==0){ $CMpos=$CMarray[$int]; } elsif($int<$#posarray){ $CM1=$CMarray[$int-1]; $CM2=$CMarray[$int]; $pos1=$posarray[$int-1]; $pos2=$posarray[$int]; $unit=($CM2-$CM1)/($pos2-$pos1); $CMpos=$CM1+$unit*($position-$pos1); } elsif($int==$#posarray){ $CMpos=$CMarray[$int-1]; } #Find loc of CMpos in the linkage results my $locint; $locarray[$#locarray+1]=500; for (my $j=0;$j<=$#locarray;$j++){ if($j==0){ if ($CMpos<=0) { $locint=$j; last; } } else { if($CMpos<$locarray[$j] &&$CMpos>= $locarray[$j-1]){ $locint=$j; last; } } } my ($lodscore, $lod1,$lod2,$loc1,$loc2,$unit); #Interpolate lod score and p-value if($locint==0){ $lodscore=$L{$locarray[$locint]}; } elsif ($locint<$#locarray){ $lod1=$L{$locarray[$locint-1]}; $lod2=$L{$locarray[$locint]}; $loc1=$locarray[$locint-1]; $loc2=$locarray[$locint]; $unit=($lod2-$lod1)/($loc2-$loc1); $lodscore=$lod1+$unit*($CMpos-$loc1); } elsif($locint==$#locarray){ $lodscore=$L{$locarray[$locint-1]}; } #print STDOUT "$chr\t$position\t$CMpos\t$locarray[$locint]\t$lodscore\n"; return ($CMpos, $lodscore); } sub temp{ open(out,">FUSIONp_st.txt"); select out; open(ain,"<$ioption{-assoc}") or die "ERROR: cannot find $ioption{-assoc}\n"; my $line=; chomp($line); print "$line\tstarta\n"; while(){ chomp; @array=split; if($array[3]>0.5){$st=1;} else {$st=0;} print "$array[0]\t$array[1]\t$array[2]\t$array[3]\t$st\n"; } } # SFDR strat assignment sub strata{ my %Zall; #linkage values for each SNP my %W; #WFDR weights for each SNP before scaling my %Wp;#weighted p-values for WFDR my %group;#SFDR group assignments my @psub; #pall divided into strata my @pcount; # >0.5 count within each psub , required for phi0 calculation for q-values my @phi0;#phi0 for groups my $SumW; #sum of Weights my $wpcount; my $phi0_W;#phi0 for WFDR if($ioption{"-linkage"} eq ""){ #Find interpolated linkage score for each SNP #Assign strata ($dum,$Zall{$array[$sc]})=&score2($array[$varcol{"pos"}],$chr,\@{$BPmap{$chr}},\@{$CMmap{$chr}}, \%{$Lscore{$chr}}, \@{$loc{$chr}}); $W{$array[$sc]}=exp($ioption{"-B"})*$Zall{$array[$sc]}; $SumW+=$W{$sc}; if($Zall{$array[$sc]}>$ioption{"-threshold"}){ $group{$array[$sc]}=1; $psub[1]{$array[$sc]}=$pall{$array[$sc]}; if($pall{$array[$sc]}>0.5){$pcount[1]++;$pcount[0]++;} } elsif($Zall{$array[$sc]}<=$ioption{"-threshold"}){ $group{$array[$sc]}=2; $psub[2]{$array[$sc]}=$pall{$array[$sc]}; if($pall{$array[$sc]}>0.5){$pcount[2]++;$pcount[0]++;} } } #if($tempind<100){ # print "$array[$sc]\t$pos{$array[$sc]}\t$pall{$array[$sc]}\t$Zall{$array[$sc]}\n"; # #} #else{last;} if($ioption{"-linkage"} eq ""){ #Assign starta by strata variable } #$snpind++; #print STDOUT "$snpind\n"; #if($snpind==(10000*$multi)){print STDOUT "$multi ";$multi++;} print STDOUT "\n"; close(ain); ##test $chr=1; &findZ(\@{$pos{$chr}}, $chr,\@{$BPmap{$chr}},\@{$CMmap{$chr}}, \%{$Lscore{$chr}}, \@{$loc{$chr}}); #Compute the WFDR weights and obtain %WP -weighted p-values my $l=keys %Zall; foreach my $k (keys %W){ $W{$k}=$W{$k}/($SumW/$l); $Wp{$k}=$pall{$k}/$W{$k}; #if ($Wp{$k}>1){$Wp{$k}= 1;} if ($Wp{$k}>0.5){$wpcount++;} } #compute phio for all groups $phi0[0]=phi0($pcount[0],$l);#pall for($k=1;$k<=$#psub;$k++){#psub my $lsub=keys %{$psub[$k]}; $phi0[$k]=phi0($pcount[$k],$lsub); } $phi0_W=phi0($wpcount,$l); print "$phi0[0]\t$phi0[1]\t$phi0[2]\t$phi0_W\n"; #refresh following from the memory my %Lscore;#linkage score hash my %loc;#likage location hash array my %BPmap;#bp map hash array my %CMmap; #cM map hash array my %Markers; #map marker names hash array return (\%ch,\%pos,\%pall,\%Zall,\%W,\%group,\@psub,\@pcount); }