Structure to dadi format

I wrote this for a friend, but thought it may be useful to others out there.

It converts a STRUCTURE-format genotype file into the SNP input format used by dadi (Diffusion Approximations for Demographic Inference).

https://github.com/paulirish/dadi

#dadi structure example
SNP Allele1 n-pop1 n-pop2 n-pop3 allele2 n-pop1 n-pop2 n-pop3 Gene Position
SNP1T/G T 27 22 43 G 3 2 1 a 1
SNP2C/T C 27 22 43 T 3 2 2 a 2
SNP3C/T C 29 24 44 T 1 0 6 a 3
SNP4C/T C 30 24 44 T 0 0 4 a 4
SNP5A/G A 3 2 1 G 27 22 0 a 5
SNP1C/T C 29 23 44 T 1 1 1 a 1
SNP6C/T C 2 2 0 T 28 22 8 a 6
SNP7C/G C 1 0 0 G 29 24 4 a 7
SNP8A/G A 30 24 43 G 0 0 34 a 8
SNP9A/C A 30 22 42 C 0 0 2 a 9

library(dplyr)
library(tidyverse)
library(xlsx)

# Load data -- in STRUCTURE format.
# The file is read as tab-delimited with no header: V1 = individual,
# V2 = population, V3 = every SNP call as one space-separated string.
# The first row holds the locus names; the example below shows two rows
# (one per allele copy) for each individual.
# Ind  Pop SNP1.....
# ind1 AI  1.....
# ind1 AI  1.....
# ind2 AI  2.....
# ind2 AI  2.....
# ind3 KO  1.....
# ind3 KO  2.....
struc_df <- read.delim(
  "../Downloads/Phpmid_cl.stru",
  header = FALSE,
  stringsAsFactors = FALSE
)
stopifnot(ncol(struc_df) >= 3L, nrow(struc_df) >= 2L)

# This can also be done in Excel, but doing it in code avoids human error.
# Put every SNP into its own column.
SNPs <- str_split(string = struc_df$V3, pattern = " ")
# Every row must split into the same number of fields, otherwise the
# rbind() below would silently recycle values.
stopifnot(length(unique(lengths(SNPs))) == 1L)
SNPs_df <- do.call(rbind, SNPs)
colnames(SNPs_df) <- SNPs_df[1, ]
# Drop the header row; the row count is taken from the data rather than
# hard-coded, so any number of individuals works.
SNPs_df <- as.data.frame(
  SNPs_df[-1, , drop = FALSE],
  stringsAsFactors = FALSE
)

# Add the metadata (individual, population) to the new data frame. It is
# taken from the same object that was loaded above, skipping the header row.
MD <- struc_df[-1, 1:2]
df_joined <- cbind(MD, SNPs_df)
# Remove the individual column as it is not needed.
df_joined_agr <- df_joined[, -1]

# Reshape to long format (SNP is the locus ID, ID is the allele call
# 1 / 2 / -9) and tally how often each call occurs per population and locus.
# count() returns one ungrouped row per (V2, SNP, ID) combination.
x <- df_joined_agr %>%
  pivot_longer(cols = -V2, names_to = "SNP", values_to = "ID") %>%
  count(V2, SNP, ID, name = "count")

# Locus names are expected to look like "11742355-20-A/G": the third
# "-"-separated piece holds the two allele letters, which are split on "/"
# so we know Allele_1 vs Allele_2. Missing data (-9) is not of interest.
x <- x %>%
  filter(ID != "-9") %>%
  separate(
    SNP,
    into = c("locus", "position", "alleles"),
    sep = "-",
    remove = FALSE
  )
x <- x[, c("V2", "SNP", "alleles", "ID", "count")]
x <- separate(x, alleles, into = c("Allele_1", "Allele_2"), sep = "/")

# One row per locus with both allele letters, so a locus where only one of
# the two alleles was observed still gets the correct letter for the other.
loci <- arrange(distinct(x, SNP, Allele_1, Allele_2), SNP)

# Counts of allele 1 per population, one column per population.
Allele_1 <- x %>%
  filter(ID == "1")
Allele_1 <- pivot_wider(
  Allele_1[, c("SNP", "V2", "count")],
  names_from = "V2",
  values_from = "count"
)
Allele_1 <- left_join(loci[, c("SNP", "Allele_1")], Allele_1, by = "SNP")

# Same for allele 2.
Allele_2 <- x %>%
  filter(ID == "2")
Allele_2 <- pivot_wider(
  Allele_2[, c("SNP", "V2", "count")],
  names_from = "V2",
  values_from = "count"
)
Allele_2 <- left_join(loci[, c("SNP", "Allele_2")], Allele_2, by = "SNP")

# Then join them up again; population columns present in both tables get the
# ".x" (allele 1) and ".y" (allele 2) suffixes.
Allele__joined <- full_join(Allele_1, Allele_2, by = "SNP")
# NAs mark populations in which an allele was never seen; the dadi format
# requires these to be 0. Only the count columns are touched so the allele
# letters are never overwritten.
count_cols <- setdiff(
  names(Allele__joined),
  c("SNP", "Allele_1", "Allele_2")
)
Allele__joined[count_cols] <- lapply(
  Allele__joined[count_cols],
  function(v) replace(v, is.na(v), 0L)
)

# Write the file to use in dadi. write.xlsx() expects a plain data.frame,
# so the tibble is converted first.
write.xlsx(as.data.frame(Allele__joined), "../Desktop/FarFar.xlsx")

# Sanity check: compare the dadi counts for one locus against the original
# calls per population.
table(df_joined_agr$V2, df_joined_agr[["11742355-20-A/G"]])

Share this: