-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCommon_prep.py
More file actions
246 lines (223 loc) · 10.6 KB
/
Common_prep.py
File metadata and controls
246 lines (223 loc) · 10.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
import pandas as pd
import subprocess
import os
import re
from collections import defaultdict
import csv
import sys
# Snakemake directive: loads the YAML file into the global `config` dict.
configfile: 'project_config_file.yaml'
### Read configuration file
# Re-wrap as defaultdict(str) so missing config keys read as '' instead of
# raising KeyError (the '' checks below rely on this).
config = defaultdict(str, config)
print('Input QC name: ' + config['QC_name'])
QC_name = config['QC_name']
# All outputs live under results/<QC_name>/ ; the trailing '/' is kept because
# downstream paths are built by plain string concatenation.
results_dir = os.path.join('results', QC_name) + '/'
log_dir = os.path.join(results_dir, 'logs/')
### Should we attempt automated cell type annotation?
do_annot = config['annotation_reference'] != ''
### Which data types are included?
assays_list = ['RNA']
clusters_list = ['RNA_clusters']
prot = config['Data_type'] == 'CITESeq'
if prot:
    # CITE-seq data adds the protein assay and its clusterings
    assays_list.append('prot')
    clusters_list.extend(['prot_clusters', 'wnn_clusters'])
if do_annot:
    # Annotated runs also report the predicted level-1 cell types
    clusters_list.append('predicted.celltype.l1')
### Is the data sample-hashed?
dehash = config['hashed']
if dehash:
    dehash_calls_tsv = defaultdict(str)
    dehash_CRint_calls_tsv = defaultdict(str)
    dehash_threshs_csv = defaultdict(str)
    for method in config['dehash_method_comparisons'].split(','):
        if os.path.isfile(method):
            ### A file path means custom hashing results: the thresholds come straight
            ### from the file; cell IDs are assumed to match the Cell Ranger outputs
            dehash_threshs_csv[method] = method
            tsv_name = re.sub('csv$', 'tsv', method)
            dehash_calls_tsv[method] = results_dir + 'Dehash/' + os.path.basename(re.sub("Dehash_threshs_", 'Dehash_calls_', tsv_name))
            dehash_CRint_calls_tsv[method] = results_dir + 'Dehash/' + os.path.basename(re.sub("Dehash_threshs_", 'Dehash_CRint_calls_', tsv_name))
        else:
            ### A bare method name: derive the expected output paths from it
            dehash_calls_tsv[method] = results_dir + 'Dehash/Dehash_calls_' + method + '.tsv'
            dehash_CRint_calls_tsv[method] = results_dir + 'Dehash/Dehash_CRint_calls_' + method + '.tsv'
            dehash_threshs_csv[method] = results_dir + 'Dehash/Dehash_threshs_' + method + '.csv'
else:
    dehash_calls_tsv = {}
    dehash_threshs_csv = {}
    dehash_CRint_calls_tsv = {}
print('\nDehash tsv files: ')
print(dehash_CRint_calls_tsv)
### Default to the first method listed in the comparisons string when no method is chosen explicitly
utilized_method_name = config['dehash_method_comparisons'].split(',')[0] if config['dehash_method'] == '' else config['dehash_method']
print('\nUsing dehash method ' + utilized_method_name)
### If LIBRASeq is not input set libraseq to F, otherwise to T
libraseq = config['LIBRASeq'] != ''
### Libraseq probes
if libraseq:
    prot = True
    methods = config['libraseq_method_comparisons'].split(',')
    # defaultdict(str) so un-set methods read as '' rather than raising KeyError
    libraseq_calls_tsv = defaultdict(str)
    libraseq_threshs_csv = defaultdict(str)
    libraseq_calls_tsv_raw = defaultdict(str)
    libraseq_threshs_csv_raw = defaultdict(str)
    for method in methods:
        ### If the input method is a file, assume it holds thresholds from some custom method. For now, we have to assume the cell IDs will match those in the Cell Ranger outputs
        if(os.path.isfile(method)):
            libraseq_threshs_csv[method] = method
            libraseq_file = results_dir + 'Libraseq/' + os.path.basename(re.sub("thresholds_", 'calls_', re.sub('csv$', 'tsv', method)))
            libraseq_calls_tsv[method] = libraseq_file
        else: ### Otherwise, define the result output based on the input method name
            libraseq_calls_tsv[method] = results_dir + 'Libraseq/norm/calls_' + method + '.tsv'
            libraseq_threshs_csv[method] = results_dir + 'Libraseq/norm/thresholds_' + method + '.csv'
            libraseq_calls_tsv_raw[method] = results_dir + 'Libraseq/raw/calls_' + method + '.tsv'
            libraseq_threshs_csv_raw[method] = results_dir + 'Libraseq/raw/thresholds_' + method + '.csv'
else:
    libraseq_calls_tsv = {}
    libraseq_threshs_csv = {}
    ### BUGFIX: the raw-count dictionaries were previously left undefined when
    ### LIBRASeq is disabled (unlike the parallel dehash block above), so any
    ### downstream reference would raise NameError. Define them empty here.
    libraseq_calls_tsv_raw = {}
    libraseq_threshs_csv_raw = {}
### If not input, assume the method to continue with is the first one input in the string of options
if(config['libraseq_method'] == ''):
    libraseq_method_name = config['libraseq_method_comparisons'].split(',')[0]
else:
    libraseq_method_name = config['libraseq_method']
if(libraseq):
    print('Using Libraseq calling method: ' + libraseq_method_name)
### If the second cluster resolution is not specified in the config file, use the first cluster resolution.
for second_res, first_res in (('RNA_resolution2', 'RNA_resolution'), ('prot_resolution2', 'prot_resolution')):
    if second_res not in config.keys():
        config[second_res] = config[first_res]
def _fill_strat_defaults(row, idx):
    """Normalize stratification *idx* (1 or 2) of one comparison row, in place.

    If strat_name<idx> is empty, set the name and both values to 'All'.
    Otherwise, if value A is set but value B is empty, copy A into B.
    """
    name_key = 'strat_name%d' % idx
    a_key = 'strat_values%dA' % idx
    b_key = 'strat_values%dB' % idx
    if row[name_key] == '':
        row[name_key] = 'All'
        row[a_key] = 'All'
        row[b_key] = 'All'
    elif row[a_key] != '' and row[b_key] == '':
        row[b_key] = row[a_key]

### Read the comparison file (if any) into comp_dict: one row dict per comparison,
### keyed by row number, with empty stratification/value fields normalized.
comp_dict = defaultdict(int)
if os.path.isfile(config['comparison_file']):
    with open(config['comparison_file'], newline = '') as csvfile:
        reader = csv.DictReader(csvfile)
        for keyint, row in enumerate(reader):
            comp_dict[keyint] = row
            # The two stratifications previously shared duplicated inline logic;
            # the helper applies the identical defaulting to each.
            _fill_strat_defaults(row, 1)
            _fill_strat_defaults(row, 2)
            ### If either test value is empty, set to NA
            if row['value1'] == '':
                row['value1'] = 'NA'
            if row['value2'] == '':
                row['value2'] = 'NA'
### Read list of clusters to sub-cluster
sc_file = os.path.join(os.getcwd(), config['sub_cluster'])
print(sc_file)
if not os.path.isfile(sc_file):
    sub_clusters = []
else:
    sc_table = pd.read_csv(sc_file)
    ### Cluster labels are likely bare numbers, so force them to strings before joining
    sc_table['cluster'] = sc_table['cluster'].astype(str)
    sub_clusters = ['-'.join(list(entry)) for _, entry in sc_table.iterrows()]
print('Sub clustering: ')
print(sub_clusters)
### Read QC summary file (input to batch_eval checkpoint) and creates dictionary to hold the file paths held within
QC_file = os.path.join(os.getcwd(), config['QC_file'])
print(QC_file)
qcdat = pd.read_csv(QC_file)
print(qcdat)
#### Add missing steps to qcdat with '' file column, including step 0 (no QC done yet)
#step_names = ['Base data', 'DSB background', 'Sample filter', 'Imputation', 'Downsample', 'Transcript filter', 'Cell filter', 'Integration', 'Feature filter', 'Cluster filter']
#step_names = ['Base data', 'Sample filter', 'Imputation', 'Cell filter', 'Integration', 'Cluster filter']
# NOTE(review): step_names is used below, but both candidate definitions above are
# commented out — as written this raises NameError unless step_names is defined
# elsewhere (e.g. in another included Snakemake file). Confirm which list applies.
steps = list(set(list(range(len(step_names)))) - set(qcdat.step))
# Missing steps get an empty 'file' entry; they become skip=True below.
d = {'step':steps, 'file':['' for s in steps]}
### If there are steps to add, add them. Conditional is necessary because merging an empty dictionary converts step numbers from str to num (for unknown reason)
if(len(steps) > 0):
    qcdat = pd.concat([qcdat, pd.DataFrame(d)])
### Reorder rows
# step_num: 'step0', 'step1', ... ; post_name: the per-step output RDS path
qcdat['step_num'] = ['step' + str(q) for q in qcdat.step]
qcdat['post_name'] = [results_dir + 'PostQC' + str(q) + '.RDS' for q in qcdat.step]
qcdat = qcdat.set_index('step')
qcdat = qcdat.sort_index(ascending = True)
### Add step_name
# assumes len(step_names) equals the number of rows in qcdat — TODO confirm
print(step_names)
qcdat['step_name'] = step_names
### Add original RDS file
#qcdat.iloc[0, qcdat.columns.get_loc('post_name')] = 'data/All_data.RDS'
qcdat.iloc[0, qcdat.columns.get_loc('post_name')] = results_dir + 'All_data.RDS'
### Add skip information
#qcdat['skip'] = [f == '' for f in qcdat.file]
# A step is skipped unless its specification file actually exists on disk.
qcdat['skip'] = [not os.path.exists(f) for f in qcdat.file]
# Step 0 (base data) always runs.
qcdat.iloc[0, qcdat.columns.get_loc('skip')] = False
print(qcdat)
# Debug print; presumably a step 6 always exists after the padding above — verify.
print(qcdat[qcdat.step_num == 'step6'].file.values[0])
QC_specs = defaultdict(str, zip(qcdat.step_num, qcdat.file))
### For RStudio Connect publication ...sigh
for publish_key in ('username', 'server'):
    if publish_key not in config.keys():
        config[publish_key] = 'NA'
### For dynamic resource requests
# Memory ladder in MB, indexed by Snakemake's retry attempt number.
# NOTE: Snakemake's `attempt` starts at 1, so the first scheduling attempt
# requests size_step[1] = 15000 MB — presumably intentional; confirm.
size_step = [8000, 15000, 64000, 240000]
def get_mem_mb(wildcards, attempt):
    """Return the memory request (MB) for the given retry *attempt*.

    The index is clamped to the last tier so that retry counts beyond the
    ladder no longer raise IndexError (previously attempt >= 4 crashed).
    """
    return size_step[min(attempt, len(size_step) - 1)]
### If the last line of the second output of cluster_filters (Filters.txt) specifies that 0 cells were filtered, skip re-clustering
def maybe_skip_second_cluster(wildcards):
    """Return the RDS to use after cluster filtering.

    Reads the cluster_filter checkpoint's Filters.txt; when its
    'Total removed' line reports 0 filtered cells, re-clustering is skipped
    and Cluster_filtered.RDS is used, otherwise Clustered_again.RDS.
    """
    filters_txt = checkpoints.cluster_filter.get().output[1]
    print(filters_txt)
    with open(filters_txt) as fh:
        for entry in fh:
            if re.search('Total removed', entry):
                filtered_line = entry
    print(filtered_line)
    ### Count of removed cells is the last tab-separated field of that line
    removed = filtered_line.split('\t')[-1]
    if int(removed) == 0:
        print('Skip second cluster.')
        return results_dir + 'Cluster_filtered.RDS'
    print('Do second cluster.')
    return results_dir + 'Clustered_again.RDS'
def maybe_skip_QC_stepN(qcdat, stepN, spec = True):
    """Resolve the inputs for QC step *stepN*.

    Returns a dict with 'rds' (the output of the last non-skipped step
    before stepN), 'touch' (the checkpoint marker file) and, when *spec*
    is True, 'qc' (this step's specification file, '' if none configured).
    """
    specs_by_step = defaultdict(str, zip(qcdat.step_num, qcdat.file))
    rds_by_step = dict(zip(qcdat.step_num, qcdat.post_name))
    step_key = 'step' + str(stepN)
    ### Only steps strictly before stepN can provide the input RDS
    preceding = qcdat[:stepN]
    active = preceding[preceding.skip == False]
    ### The most recent step that actually ran supplies the RDS file
    last_active = active.iloc[-1, preceding.columns.get_loc('step_num')]
    result = {'rds': rds_by_step[last_active], 'touch': 'data/checkpoint.done'}
    if spec:
        result['qc'] = specs_by_step[step_key]
    return result
def maybe_skip_annotation(do_annot):
    """Choose the downstream target depending on automated annotation.

    With annotation enabled, the pipeline continues from Mapped.RDS;
    otherwise the maybe_skip_second_cluster input function decides.
    """
    if not do_annot:
        return maybe_skip_second_cluster
    print('Doing annotation')
    return results_dir + 'Mapped.RDS'
def maybe_skip_integration(qcdat):
    """Pick the post-normalization RDS depending on whether the
    'Integration' QC step is skipped in *qcdat*."""
    skip_by_name = defaultdict(str, zip(qcdat.step_name, qcdat.skip))
    if skip_by_name['Integration'] == True:
        return results_dir + 'RNA_normalized.RDS'
    return results_dir + 'PostQC4.RDS'
def aggregate_txt_inputs(wildcards):
    """Input function aggregating per-batch cell-filter outputs.

    Waits on the set_batches checkpoint, then returns one
    Cell_filtered.txt path per batch directory {i} found under the
    checkpoint's output directory. Batch directories are detected by the
    presence of `sheet_filename` (a module-level name defined elsewhere —
    not visible in this chunk; confirm its definition).
    """
    checkpoint_output = checkpoints.set_batches.get(**wildcards).output[0]
    # glob_wildcards matches: <checkpoint_output>/<i>/<sheet_filename>
    txt = expand(results_dir + "batches/{i}/Cell_filtered.txt", i = glob_wildcards(os.path.join(checkpoint_output, "{i}", sheet_filename)).i)
    return txt