process_genomic.py now take into account the different genotypes of h…

…omozygous offspring to infer mother genotypes.
cmdoret · Oct 6, 2018 · 862d54d · 862d54d
1 parent 0cbafb2
commit 862d54d
Showing 1 changed file with 36 additions and 51 deletions.
diff --git a/src/assoc_mapping/process_genomic.py b/src/assoc_mapping/process_genomic.py
@@ -33,21 +33,20 @@
  populations output files. Used as input')
 parser.add_argument('out', type=str,
  help='Folder where output will be written')
-parser.add_argument('grouped_input', type=str,
- help='If all families were grouped into a \
- single populations run, set to "T", \
- otherwise set to "F"')
 parser.add_argument('--keep_all', action='store_true',
  help='Keep all SNPs, even if missing or\
  homozygous in the mother.')
 parser.add_argument('--pool_output', action='store_true',
  help='Pool output instead of computing \
  proportions per family')
+parser.add_argument('--sample_list', type=str,
+ help='File containing the list of samples with family, \
+ name, sex and generation information. Defaults to \
+ "data/individuals.tsv".', default='data/individuals.tsv')
 
 args = parser.parse_args()
 
 # ========== DEFINING FUNCTIONS ==========#
-sum_file = "../../data/populations/fam_d-20_r-80/A/batch_0.sumstats.tsv"
 
 
 def unify_genomic(pop_path, pop):
@@ -112,8 +111,6 @@ def gen_decode(encoded):
  This function decodes numeric genotypes and
  replaces it with E (heterozygous), O (homozygous)
  or M (missing).
- :param encoded: pandas dataframe, respecting the genomics output structure
- from STACKS populations module, only the genotype columns must be included.
  :returns: A dataframe containing the genotype letters.
  """
 
@@ -166,8 +163,7 @@ def mother_hom(geno, pop):
 
  # Include SNP if number of genotypes > 1 (het = auto include)
  fam_SNP = fam_SNP[fam_SNP < 2].index.tolist()
- # fam_SNP = np.where(np.all(geno[fam.Name.tolist()].isin(['O', 'M']),
- # axis=1))[0]
+
  # Change those sites to M in all indv with same family
  geno.loc[fam_SNP, fam.Name] = 'M'
  # If using pandas <0.19, the above line will fail. The loop below can
@@ -198,10 +194,9 @@ def prop_hom(pop, geno):
  # Get sample names by sex
  sex_id = {'M': pop.Name[pop.Sex == 'M'],
  'F': pop.Name[pop.Sex == 'F']}
-
+ 
  # Un-specify the genotype of homozygous individuals (to comptabilise recomb)
  geno = geno.replace(['O1','O5','O8', 'O10'], "O")
- print(geno.iloc[1:50,])
  # Counting how many individuals are used to compute proportion at each SNP
  sample_size = {} # Number of individuals in which each site is found
  hom = {} # proportion of individuals in which each site is homozygous
@@ -284,61 +279,52 @@ def parallel_func(f, df, f_args=[], chunk_size=1000):
 
 # ========== LOADING AND PROCESSING DATA ==========#
 # Path to STACKS populations folder and output file
-# in_path = "data/populations/grouped_d-5_r-80/"
 
 
 in_path = args.pop_files
-out_prefix = 'grouped_' if args.grouped_input == "T" else "fam_"
+out_prefix = 'grouped_'
 if args.pool_output:
  out_prefix += "outpool_"
 if args.keep_all:
  out_prefix += "keep_"
 out_path = path.join(args.out, (out_prefix + "prophom.tsv"))
-indv_path = "data/individuals.tsv" # family and sex information
+indv_path = args.sample_list # family and sex information
 indv = pd.read_csv(indv_path, sep='\t') # Family and sex info
 # Preparing data structure to match sample names and families with columns
 
-if args.grouped_input == 'T': # Single populations folder
- try:
- # Names in correct order
- samples = pd.read_csv(path.join(in_path, "batch_1.sumstats.tsv"),
- sep='\t', nrows=2, header=None)
-
- # Concatenating populations
- names = samples.iloc[:, 1][0].split(',') + \
- samples.iloc[:, 1][1].split(',')
- except pd.errors.ParserError:
- # In case only 1 sex is present
- samples = pd.read_csv(path.join(in_path, "batch_1.sumstats.tsv"),
- sep='\t', nrows=1, header=None)
-
- # Concatenating populations
- names = samples.iloc[:, 1][0].split(',')
-
- names = pd.DataFrame({'Name': names})
- # Adding family and sex, keeping order
- pop = names.merge(indv, on='Name', how='left')
- genomic = pd.read_csv(path.join(in_path, "batch_1.genomic.tsv"),
- sep='\t', header=None, skiprows=1)
- # select only samples cols and reindex from 0
- gen_indv = genomic.iloc[:, 3:].T.reset_index(drop=True).T
- # Replacing numeric header with corresponding sample name
- gen_indv.rename(columns=lambda x: pop.Name[x], inplace=True)
- print("Processing {0} sites across {1} samples.".format(gen_indv.shape[0],
- gen_indv.shape[1]))
-else: # One populations folder per family
- pop = indv
- genomic = unify_genomic(in_path, pop)
- pop.drop(pop.index[~pop.Name.isin(genomic.columns.values[3:])],
- axis=0, inplace=True)
- pop = pop.reset_index(drop=True)
- gen_indv = genomic.iloc[:, 3:]
+
+try:
+ # Names in correct order
+ samples = pd.read_csv(path.join(in_path, "batch_1.sumstats.tsv"),
+ sep='\t', nrows=2, header=None)
+
+ # Concatenating populations
+ names = samples.iloc[:, 1][0].split(',') + \
+ samples.iloc[:, 1][1].split(',')
+except pd.errors.ParserError:
+ # In case only 1 sex is present
+ samples = pd.read_csv(path.join(in_path, "batch_1.sumstats.tsv"),
+ sep='\t', nrows=1, header=None)
+
+ # Concatenating populations
+ names = samples.iloc[:, 1][0].split(',')
+
+names = pd.DataFrame({'Name': names})
+# Adding family and sex, keeping order
+pop = names.merge(indv, on='Name', how='left')
+genomic = pd.read_csv(path.join(in_path, "batch_1.genomic.tsv"),
+ sep='\t', header=None, skiprows=1)
+# select only samples cols and reindex from 0
+gen_indv = genomic.iloc[:, 3:].T.reset_index(drop=True).T
+# Replacing numeric header with corresponding sample name
+gen_indv.rename(columns=lambda x: pop.Name[x], inplace=True)
+print("Processing {0} sites across {1} samples.".format(gen_indv.shape[0],
+ gen_indv.shape[1]))
 print("files loaded")
 
 # ========== RUNNING CODE ==========#
 # Decoding numeric genotypes into states (het, hom, missing)
 state = parallel_func(gen_decode, gen_indv)
-
 print("genotypes decoded")
 # Will run unless user explicitly set the --keep_all parameter
 if not args.keep_all:
@@ -355,7 +341,6 @@ def parallel_func(f, df, f_args=[], chunk_size=1000):
 # ========== SAVING OUTPUT ==========#
 # Merging Chromosomal positions with proportion of homozygosity into 1 df
 prop = genomic.iloc[:, 0: 3].merge(prop, left_index=True, right_index=True)
-
 state = genomic.iloc[:, 0: 3].merge(state, left_index=True, right_index=True)
 prop.rename(columns={0: "Locus.ID", 1: "Chr", 2: "BP"}, inplace=True)
 state.rename(columns={0: "Locus.ID", 1: "Chr", 2: "BP"}, inplace=True)