# dag9_tenx_single_cell_immune_profiling.py
  1. from datetime import timedelta
  2. import os,json,logging,subprocess
  3. from airflow.models import DAG,Variable
  4. from airflow.utils.dates import days_ago
  5. from airflow.operators.python_operator import PythonOperator
  6. from airflow.operators.python_operator import BranchPythonOperator
  7. from airflow.operators.dummy_operator import DummyOperator
  8. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import fetch_analysis_info_and_branch_func
  9. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import configure_cellranger_run_func
  10. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_sc_read_trimmming_func
  11. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_cellranger_tool
  12. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import decide_analysis_branch_func
  13. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_cellranger_result_to_db_func
  14. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import ftp_files_upload_for_analysis
  15. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import irods_files_upload_for_analysis
  16. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_singlecell_notebook_wrapper_func
  17. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_analysis_files_func
  18. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import task_branch_function
  19. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import upload_analysis_file_to_box
  20. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import convert_bam_to_cram_func
  21. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_picard_for_cellranger
  22. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_samtools_for_cellranger
  23. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_multiqc_for_cellranger
  24. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import index_and_copy_bam_for_parallel_analysis
  25. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import change_pipeline_status
  26. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import clean_up_files
  27. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import create_and_update_qc_pages
  28. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_cellranger_metrices_to_collection
  29. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import generate_cell_sorted_bam_func
  30. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_velocyto_func
  31. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_scvelo_for_sc_5p_func
  ## ARGS
  # Defaults handed to the DAG below through its ``default_args`` argument.
  # NOTE(review): 'max_active_runs' and 'catchup' are usually DAG-level
  # settings; confirm they have any effect when supplied via default_args.
  default_args = dict(
    owner='airflow',
    depends_on_past=False,
    start_date=days_ago(2),
    email_on_failure=False,
    email_on_retry=False,
    retries=4,
    max_active_runs=10,
    catchup=True,
    retry_delay=timedelta(minutes=5),
    provide_context=True,
  )
  # Feature-type configuration read from an Airflow Variable. The task loop
  # in this file calls ``.keys()`` on this value, so it has to be a dict:
  # a stored Variable comes back as a raw string unless
  # ``deserialize_json=True`` is passed, while an unset Variable falls back
  # to the empty dict given as ``default_var``.
  FEATURE_TYPE_LIST = \
    Variable.get(
      'tenx_single_cell_immune_profiling_feature_types',
      default_var={},
      deserialize_json=True)
  ## DAG
  # DAG object for this pipeline; schedule_interval is None and the shared
  # operator defaults come from default_args.
  dag = DAG(
    dag_id='dag9_tenx_single_cell_immune_profiling',
    default_args=default_args,
    schedule_interval=None,
    max_active_runs=20,
    concurrency=100,
    orientation='LR',
    tags=['hpc', 'analysis', 'tenx', 'sc'])
  57. with dag:
  ## TASK
  # Entry branch task 'fetch_analysis_info'. Its params name the
  # 'no_analysis' task and the two XCom keys that are handed to
  # fetch_analysis_info_and_branch_func.
  fetch_analysis_info_and_branch = BranchPythonOperator(
    task_id='fetch_analysis_info',
    python_callable=fetch_analysis_info_and_branch_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'no_analysis_task': 'no_analysis',
      'analysis_description_xcom_key': 'analysis_description',
      'analysis_info_xcom_key': 'analysis_info',
    })
  ## TASK
  # Task 'configure_cellranger_run'; every 'collect_trimmed_files_*' task
  # feeds into it, and it uses the 'none_failed_or_skipped' trigger rule.
  configure_cellranger_run = PythonOperator(
    task_id='configure_cellranger_run',
    python_callable=configure_cellranger_run_func,
    trigger_rule='none_failed_or_skipped',
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task_id': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'analysis_info_xcom_key': 'analysis_info',
      'library_csv_xcom_key': 'cellranger_library_csv',
    })
  # For every key of FEATURE_TYPE_LIST: one branch task named after the key,
  # ten 'run_trim_<key>_<n>' tasks behind it, and a collector task that
  # leads into 'configure_cellranger_run'.
  for analysis_name in FEATURE_TYPE_LIST.keys():
    ## TASK
    task_branch = BranchPythonOperator(
      task_id=analysis_name,
      python_callable=task_branch_function,
      queue='hpc_4G',
      dag=dag,
      params={
        'xcom_pull_task_id': 'fetch_analysis_info',
        'analysis_info_xcom_key': 'analysis_info',
        'analysis_name': analysis_name,
        'task_prefix': 'run_trim',
      })
    run_trim_list = []
    for run_id in range(10):
      ## TASK
      run_trim_list.append(
        PythonOperator(
          task_id='run_trim_{0}_{1}'.format(analysis_name, run_id),
          python_callable=run_sc_read_trimmming_func,
          queue='hpc_4G',
          dag=dag,
          params={
            'xcom_pull_task_id': 'fetch_analysis_info',
            'analysis_info_xcom_key': 'analysis_info',
            'analysis_description_xcom_key': 'analysis_description',
            'analysis_name': analysis_name,
            'run_id': run_id,
            'r1-length': 0,
            'r2-length': 0,
            'fastq_input_dir_tag': 'fastq_dir',
            'use_ephemeral_space': True,
            'fastq_output_dir_tag': 'output_path',
          }))
    ## TASK
    collect_trimmed_files = DummyOperator(
      task_id='collect_trimmed_files_{0}'.format(analysis_name),
      trigger_rule='none_failed_or_skipped',
      dag=dag)
    ## PIPELINE
    fetch_analysis_info_and_branch >> task_branch >> run_trim_list
    run_trim_list >> collect_trimmed_files
    collect_trimmed_files >> configure_cellranger_run
  ## TASK
  # Placeholder task whose id matches the 'no_analysis_task' param of
  # 'fetch_analysis_info'.
  no_analysis = DummyOperator(task_id='no_analysis', dag=dag)
  ## PIPELINE
  fetch_analysis_info_and_branch >> no_analysis
  ## TASK
  # Task 'run_cellranger' on the 'hpc_64G16t24hr' queue; the cellranger
  # option strings request 16 local cores and 64 local memory.
  run_cellranger = PythonOperator(
    task_id='run_cellranger',
    python_callable=run_cellranger_tool,
    queue='hpc_64G16t24hr',
    dag=dag,
    params={
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'library_csv_xcom_key': 'cellranger_library_csv',
      'library_csv_xcom_pull_task': 'configure_cellranger_run',
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_options': ['--localcores 16', '--localmem 64'],
    })
  ## PIPELINE
  configure_cellranger_run >> run_cellranger
  ## TASK
  # Branch task after cellranger; the params map option names to the ids of
  # the candidate downstream tasks passed to decide_analysis_branch_func.
  decide_analysis_branch = BranchPythonOperator(
    task_id='decide_analysis_branch',
    python_callable=decide_analysis_branch_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'load_cellranger_result_to_db_task': 'load_cellranger_result_to_db',
      'run_scanpy_for_sc_5p_task': 'run_scanpy_for_sc_5p',
      'run_scirpy_for_vdj_task': 'run_scirpy_for_vdj',
      'run_scirpy_for_vdj_b_task': 'run_scirpy_for_vdj_b',
      'run_scirpy_vdj_t_task': 'run_scirpy_for_vdj_t',
      'run_seurat_for_sc_5p_task': 'run_seurat_for_sc_5p',
      'convert_cellranger_bam_to_cram_task': 'convert_cellranger_bam_to_cram',
      'library_csv_xcom_key': 'cellranger_library_csv',
      'library_csv_xcom_pull_task': 'configure_cellranger_run',
    })
  ## PIPELINE
  run_cellranger >> decide_analysis_branch
  ## TASK
  # Cellranger result loading followed by three uploads (ftp, box, irods)
  # that all pull their XCom values from 'load_cellranger_result_to_db'.
  load_cellranger_result_to_db = PythonOperator(
    task_id='load_cellranger_result_to_db',
    python_callable=load_cellranger_result_to_db_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'collection_type': 'CELLRANGER_MULTI',
      'html_collection_type': 'CELLRANGER_HTML',
      'collection_table': 'sample',
      'xcom_collection_name_key': 'sample_igf_id',
      'genome_column': 'genome_build',
      'analysis_name': 'cellranger_multi',
      'output_xcom_key': 'loaded_output_files',
      'html_xcom_key': 'html_report_file',
      'html_report_file_name': 'web_summary.html',
    })
  upload_cellranger_report_to_ftp = PythonOperator(
    task_id='upload_cellranger_report_to_ftp',
    python_callable=ftp_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'html_report_file',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_CELLRANGER_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  upload_cellranger_report_to_box = PythonOperator(
    task_id='upload_cellranger_report_to_box',
    python_callable=upload_analysis_file_to_box,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'html_report_file',
      'analysis_tag': 'cellranger_multi',
    })
  upload_cellranger_results_to_irods = PythonOperator(
    task_id='upload_cellranger_results_to_irods',
    python_callable=irods_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'loaded_output_files',
      'collection_name_key': 'sample_igf_id',
      'collection_name_task': 'load_cellranger_result_to_db',
      'analysis_name': 'cellranger_multi',
    })
  ## PIPELINE
  decide_analysis_branch >> load_cellranger_result_to_db
  load_cellranger_result_to_db >> [
    upload_cellranger_report_to_ftp,
    upload_cellranger_report_to_box,
    upload_cellranger_results_to_irods]
  ## TASK
  # Scanpy section: the notebook task, the 'count' metrics load, the report
  # load with its ftp/box uploads, and the cellbrowser ftp upload.
  run_scanpy_for_sc_5p = PythonOperator(
    task_id='run_scanpy_for_sc_5p',
    python_callable=run_singlecell_notebook_wrapper_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'count_dir': 'count',
      'analysis_name': 'scanpy',
      'output_notebook_key': 'scanpy_notebook',
      'output_cellbrowser_key': 'cellbrowser_dirs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    })
  load_cellranger_gex_matrics_to_db = PythonOperator(
    task_id='load_cellranger_gex_matrics_to_db',
    python_callable=load_cellranger_metrices_to_collection,
    queue='hpc_4G',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'collection_type': 'CELLRANGER_MULTI',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'metrics_summary_file': 'count/metrics_summary.csv',
      'attribute_prefix': 'CELLRANGER_COUNT',
    })
  load_scanpy_report_for_sc_5p_to_db = PythonOperator(
    task_id='load_scanpy_report_for_sc_5p_to_db',
    python_callable=load_analysis_files_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scanpy_for_sc_5p',
      'file_name_key': 'scanpy_notebook',
      'analysis_name': 'scanpy_5p',
      'collection_type': 'SCANPY_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  upload_scanpy_report_for_sc_5p_to_ftp = PythonOperator(
    task_id='upload_scanpy_report_for_sc_5p_to_ftp',
    python_callable=ftp_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_scanpy_report_for_sc_5p_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCANPY_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  upload_scanpy_report_for_sc_5p_to_box = PythonOperator(
    task_id='upload_scanpy_report_for_sc_5p_to_box',
    python_callable=upload_analysis_file_to_box,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_scanpy_report_for_sc_5p_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scanpy_single_sample_report',
    })
  upload_cellbrowser_for_sc_5p_to_ftp = PythonOperator(
    task_id='upload_cellbrowser_for_sc_5p_to_ftp',
    python_callable=ftp_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'run_scanpy_for_sc_5p',
      'xcom_pull_files_key': 'cellbrowser_dirs',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_CELLBROWSER',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  ## PIPELINE
  decide_analysis_branch >> run_scanpy_for_sc_5p
  run_scanpy_for_sc_5p >> [
    load_scanpy_report_for_sc_5p_to_db,
    load_cellranger_gex_matrics_to_db,
    upload_cellbrowser_for_sc_5p_to_ftp]
  load_scanpy_report_for_sc_5p_to_db >> [
    upload_scanpy_report_for_sc_5p_to_ftp,
    upload_scanpy_report_for_sc_5p_to_box]
  ## TASK
  # Scirpy section for the 'vdj_b' directory: notebook task, 'vdj_b' metrics
  # load, report load and its ftp/box uploads.
  run_scirpy_for_vdj_b = PythonOperator(
    task_id='run_scirpy_for_vdj_b',
    python_callable=run_singlecell_notebook_wrapper_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'analysis_name': 'scirpy',
      'vdj_dir': 'vdj_b',
      'count_dir': 'count',
      'output_notebook_key': 'scirpy_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    })
  load_cellranger_vdjB_matrics_to_db = PythonOperator(
    task_id='load_cellranger_vdjB_matrics_to_db',
    python_callable=load_cellranger_metrices_to_collection,
    queue='hpc_4G',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'collection_type': 'CELLRANGER_MULTI',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'metrics_summary_file': 'vdj_b/metrics_summary.csv',
      'attribute_prefix': 'CELLRANGER_VDJB',
    })
  load_scirpy_report_for_vdj_b_to_db = PythonOperator(
    task_id='load_scirpy_report_for_vdj_b_to_db',
    python_callable=load_analysis_files_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scirpy_for_vdj_b',
      'file_name_key': 'scirpy_notebook',
      'analysis_name': 'scirpy_vdj_b',
      'collection_type': 'SCIRPY_VDJ_B_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  upload_scirpy_report_for_vdj_b_to_ftp = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_b_to_ftp',
    python_callable=ftp_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_b_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCIRPY_VDJ_B_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  # NOTE(review): this task_id says 'scanpy' while the variable name and the
  # matching vdj_t task say 'scirpy'. Kept unchanged because a task_id is
  # visible outside this file -- confirm whether it should be renamed.
  upload_scirpy_report_for_vdj_b_to_box = PythonOperator(
    task_id='upload_scanpy_report_for_vdj_b_to_box',
    python_callable=upload_analysis_file_to_box,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_b_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scirpy_vdj_b_single_sample_report',
    })
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj_b
  run_scirpy_for_vdj_b >> [
    load_scirpy_report_for_vdj_b_to_db,
    load_cellranger_vdjB_matrics_to_db]
  load_scirpy_report_for_vdj_b_to_db >> [
    upload_scirpy_report_for_vdj_b_to_ftp,
    upload_scirpy_report_for_vdj_b_to_box]
  ## TASK
  # Scirpy section for the 'vdj_t' directory: notebook task, 'vdj_t' metrics
  # load, report load and its ftp/box uploads.
  run_scirpy_for_vdj_t = PythonOperator(
    task_id='run_scirpy_for_vdj_t',
    python_callable=run_singlecell_notebook_wrapper_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'analysis_name': 'scirpy',
      'vdj_dir': 'vdj_t',
      'count_dir': 'count',
      'output_notebook_key': 'scirpy_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    })
  load_cellranger_vdjT_matrics_to_db = PythonOperator(
    task_id='load_cellranger_vdjT_matrics_to_db',
    python_callable=load_cellranger_metrices_to_collection,
    queue='hpc_4G',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'collection_type': 'CELLRANGER_MULTI',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'metrics_summary_file': 'vdj_t/metrics_summary.csv',
      'attribute_prefix': 'CELLRANGER_VDJT',
    })
  load_scirpy_report_for_vdj_t_to_db = PythonOperator(
    task_id='load_scirpy_report_for_vdj_t_to_db',
    python_callable=load_analysis_files_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scirpy_for_vdj_t',
      'file_name_key': 'scirpy_notebook',
      'analysis_name': 'scirpy_vdj_t',
      'collection_type': 'SCIRPY_VDJ_T_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  upload_scirpy_report_for_vdj_t_to_ftp = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_t_to_ftp',
    python_callable=ftp_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_t_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCIRPY_VDJ_T_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  upload_scirpy_report_for_vdj_t_to_box = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_t_to_box',
    python_callable=upload_analysis_file_to_box,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_t_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scirpy_vdj_t_single_sample_report',
    })
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj_t
  run_scirpy_for_vdj_t >> [
    load_scirpy_report_for_vdj_t_to_db,
    load_cellranger_vdjT_matrics_to_db]
  load_scirpy_report_for_vdj_t_to_db >> [
    upload_scirpy_report_for_vdj_t_to_ftp,
    upload_scirpy_report_for_vdj_t_to_box]
  ## TASK
  # Seurat section: notebook task run with the 'ir' kernel, report load and
  # its ftp/box uploads.
  run_seurat_for_sc_5p = PythonOperator(
    task_id='run_seurat_for_sc_5p',
    python_callable=run_singlecell_notebook_wrapper_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'ir',
      'analysis_name': 'seurat',
      'vdj_dir': 'vdj',
      'count_dir': 'count',
      'output_notebook_key': 'seurat_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    })
  load_seurat_report_for_sc_5p_db = PythonOperator(
    task_id='load_seurat_report_for_sc_5p_db',
    python_callable=load_analysis_files_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_seurat_for_sc_5p',
      'file_name_key': 'seurat_notebook',
      'analysis_name': 'seurat_5p',
      'collection_type': 'SEURAT_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  upload_seurat_report_for_sc_5p_ftp = PythonOperator(
    task_id='upload_seurat_report_for_sc_5p_ftp',
    python_callable=ftp_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_seurat_report_for_sc_5p_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SEURAT_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  upload_seurat_report_for_sc_5p_to_box = PythonOperator(
    task_id='upload_seurat_report_for_sc_5p_to_box',
    python_callable=upload_analysis_file_to_box,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_seurat_report_for_sc_5p_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'seurat_single_sample_report',
    })
  ## PIPELINE
  decide_analysis_branch >> run_seurat_for_sc_5p >> load_seurat_report_for_sc_5p_db
  load_seurat_report_for_sc_5p_db >> [
    upload_seurat_report_for_sc_5p_ftp,
    upload_seurat_report_for_sc_5p_to_box]
  ## TASK
  # Task 'convert_cellranger_bam_to_cram' on the 'hpc_4G4t' queue with
  # 'threads' set to 4; its cram XCom key is 'cram_files'.
  convert_cellranger_bam_to_cram = PythonOperator(
    task_id='convert_cellranger_bam_to_cram',
    python_callable=convert_bam_to_cram_func,
    queue='hpc_4G4t',
    dag=dag,
    params={
      'xcom_pull_files_key': 'cellranger_output',
      'xcom_pull_task': 'run_cellranger',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'threads': 4,
      'analysis_name': 'cellranger',
      'collection_type': 'ANALYSIS_CRAM',
      'collection_table': 'sample',
      'cram_files_xcom_key': 'cram_files',
    })
  ## PIPELINE
  decide_analysis_branch >> convert_cellranger_bam_to_cram
  ## TASK
  # Task 'generate_cell_sorted_bam'; the params give the input and output
  # bam paths under 'count/' plus the samtools memory and thread settings.
  generate_cell_sorted_bam = PythonOperator(
    task_id='generate_cell_sorted_bam',
    python_callable=generate_cell_sorted_bam_func,
    queue='hpc_16G8t',
    dag=dag,
    params={
      'xcom_pull_task': 'run_cellranger',
      'xcom_pull_files_key': 'cellranger_output',
      'cellranger_bam_path': 'count/possorted_genome_bam.bam',
      'cellsorted_bam_path': 'count/cellsorted_possorted_genome_bam.bam',
      'samtools_mem': '2G',
      'threads': 7,
    })
  # Task 'run_velocyto' on the 'hpc_16G_long' queue. ``dag=dag`` is passed
  # explicitly, as for every other operator in this file, so the task does
  # not depend solely on the surrounding ``with dag:`` block to be attached.
  run_velocyto = \
    PythonOperator(
      task_id='run_velocyto',
      dag=dag,
      queue='hpc_16G_long',
      python_callable=run_velocyto_func,
      params={
        'xcom_pull_task': 'run_cellranger',
        'xcom_pull_files_key': 'cellranger_output',
        'analysis_description_xcom_pull_task': 'fetch_analysis_info',
        'analysis_description_xcom_key': 'analysis_description'})
  # scVelo notebook task, loom/report loading tasks and their uploads,
  # followed by the wiring of the whole velocyto branch.
  run_scvelo_for_sc_5p = PythonOperator(
    task_id='run_scvelo_for_sc_5p',
    python_callable=run_scvelo_for_sc_5p_func,
    queue='hpc_16G8t',
    dag=dag,
    params={
      'xcom_pull_task': 'run_cellranger',
      'xcom_pull_files_key': 'cellranger_output',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'loom_file_key': 'loom_output',
      'loom_file_task': 'run_velocyto',
      'timeout': 1200,
      'allow_errors': False,
      'cpu_threads': 8,
      'output_notebook_key': 'scvelo_notebook',
    })
  load_loom_file_to_rds = PythonOperator(
    task_id='load_loom_file_to_rds',
    python_callable=load_analysis_files_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_velocyto',
      'file_name_key': 'loom_output',
      'analysis_name': 'velocyto_5p',
      'collection_type': 'VELOCYTO_LOOM',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  upload_loom_file_to_irods = PythonOperator(
    task_id='upload_loom_file_to_irods',
    python_callable=irods_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_loom_file_to_rds',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_key': 'sample_igf_id',
      'collection_name_task': 'load_cellranger_result_to_db',
      'analysis_name': 'velocyto_loom',
    })
  load_scvelo_report_to_rds = PythonOperator(
    task_id='load_scvelo_report_to_rds',
    python_callable=load_analysis_files_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scvelo_for_sc_5p',
      'file_name_key': 'scvelo_notebook',
      'analysis_name': 'scvelo_5p',
      'collection_type': 'SCVELO_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  upload_scvelo_report_to_ftp = PythonOperator(
    task_id='upload_scvelo_report_to_ftp',
    python_callable=ftp_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_scvelo_report_to_rds',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCVELO_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  upload_scvelo_report_to_box = PythonOperator(
    task_id='upload_scvelo_report_to_box',
    python_callable=upload_analysis_file_to_box,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_scvelo_report_to_rds',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scvelo_single_sample_report',
    })
  ## PIPELINE
  convert_cellranger_bam_to_cram >> generate_cell_sorted_bam >> run_velocyto
  run_velocyto >> [run_scvelo_for_sc_5p, load_loom_file_to_rds]
  load_loom_file_to_rds >> upload_loom_file_to_irods
  run_scvelo_for_sc_5p >> load_scvelo_report_to_rds
  load_scvelo_report_to_rds >> [
    upload_scvelo_report_to_ftp,
    upload_scvelo_report_to_box]
  ## TASK
  # Branch task 'copy_bam_for_parallel_runs'; 'list_of_tasks' carries the
  # ids of the picard/samtools tasks given to
  # index_and_copy_bam_for_parallel_analysis.
  copy_bam_for_parallel_runs = BranchPythonOperator(
    task_id='copy_bam_for_parallel_runs',
    python_callable=index_and_copy_bam_for_parallel_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_files_key': 'cellranger_output',
      'xcom_pull_task': 'run_cellranger',
      'list_of_tasks': [
        'run_picard_alignment_summary',
        'run_picard_qual_summary',
        'run_picard_rna_summary',
        'run_picard_gc_summary',
        'run_picard_base_dist_summary',
        'run_samtools_stats',
      ],
    })
  ## TASK
  # Uploads the 'cram_files' XCom of 'convert_cellranger_bam_to_cram'.
  upload_cram_to_irods = PythonOperator(
    task_id='upload_cram_to_irods',
    python_callable=irods_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'convert_cellranger_bam_to_cram',
      'xcom_pull_files_key': 'cram_files',
      'collection_name_key': 'sample_igf_id',
      'collection_name_task': 'load_cellranger_result_to_db',
      'analysis_name': 'cellranger_multi',
    })
  ## PIPELINE
  #convert_cellranger_bam_to_cram >> copy_bam_for_parallel_runs # we need to load metrics to cram
  generate_cell_sorted_bam >> copy_bam_for_parallel_runs
  convert_cellranger_bam_to_cram >> upload_cram_to_irods
  672. ## TASK
  673. run_picard_alignment_summary = \
  674. PythonOperator(
  675. task_id='run_picard_alignment_summary',
  676. dag=dag,
  677. queue='hpc_4G',
  678. python_callable=run_picard_for_cellranger,
  679. params={'xcom_pull_files_key':'run_picard_alignment_summary',
  680. 'xcom_pull_task':'copy_bam_for_parallel_runs',
  681. 'analysis_description_xcom_pull_task':'fetch_analysis_info',
  682. 'analysis_description_xcom_key':'analysis_description',
  683. 'use_ephemeral_space':True,
  684. 'load_metrics_to_cram':True,
  685. 'java_param':'-Xmx4g',
  686. 'picard_command':'CollectAlignmentSummaryMetrics',
  687. 'picard_option':{},
  688. 'analysis_files_xcom_key':'picard_alignment_summary',
  689. 'bam_files_xcom_key':None})
  690. ## PIPELINE
  691. copy_bam_for_parallel_runs >> run_picard_alignment_summary
  692. ## TASK
  693. cleanup_picard_alignment_summary_input = \
  694. PythonOperator(
  695. task_id='cleanup_picard_alignment_summary_input',
  696. dag=dag,
  697. queue='hpc_4G',
  698. python_callable=clean_up_files,
  699. params={'xcom_pull_files_key':'run_picard_alignment_summary',
  700. 'xcom_pull_task':'copy_bam_for_parallel_runs'})
  701. ## PIPELINE
  702. run_picard_alignment_summary >> cleanup_picard_alignment_summary_input
  703. ## TASK
  704. upload_picard_alignment_summary_to_box = \
  705. PythonOperator(
  706. task_id='upload_picard_alignment_summary_to_box',
  707. dag=dag,
  708. queue='hpc_4G',
  709. python_callable=upload_analysis_file_to_box,
  710. params={'xcom_pull_task':'run_picard_alignment_summary',
  711. 'xcom_pull_files_key':'picard_alignment_summary',
  712. 'analysis_tag':'Picard-CollectAlignmentSummaryMetrics'})
  713. ## PIPELINE
  714. run_picard_alignment_summary >> upload_picard_alignment_summary_to_box
  715. ## TASK
  716. run_picard_qual_summary = \
  717. PythonOperator(
  718. task_id='run_picard_qual_summary',
  719. dag=dag,
  720. queue='hpc_4G',
  721. python_callable=run_picard_for_cellranger,
  722. params={'xcom_pull_files_key':'run_picard_qual_summary',
  723. 'xcom_pull_task':'copy_bam_for_parallel_runs',
  724. 'analysis_description_xcom_pull_task':'fetch_analysis_info',
  725. 'analysis_description_xcom_key':'analysis_description',
  726. 'use_ephemeral_space':True,
  727. 'load_metrics_to_cram':True,
  728. 'java_param':'-Xmx4g',
  729. 'picard_command':'QualityScoreDistribution',
  730. 'picard_option':{},
  731. 'analysis_files_xcom_key':'picard_qual_summary',
  732. 'bam_files_xcom_key':None})
  733. ## PIPELINE
  734. copy_bam_for_parallel_runs >> run_picard_qual_summary
  735. ## TASK
  736. cleanup_picard_qual_summary_input = \
  737. PythonOperator(
  738. task_id='cleanup_picard_qual_summary_input',
  739. dag=dag,
  740. queue='hpc_4G',
  741. python_callable=clean_up_files,
  742. params={'xcom_pull_files_key':'run_picard_qual_summary',
  743. 'xcom_pull_task':'copy_bam_for_parallel_runs'})
  744. ## PIPELINE
  745. run_picard_qual_summary >> cleanup_picard_qual_summary_input
  746. ## TASK
  747. upload_picard_qual_summary_to_box = \
  748. PythonOperator(
  749. task_id='upload_picard_qual_summary_to_box',
  750. dag=dag,
  751. queue='hpc_4G',
  752. python_callable=upload_analysis_file_to_box,
  753. params={'xcom_pull_task':'run_picard_qual_summary',
  754. 'xcom_pull_files_key':'picard_qual_summary',
  755. 'analysis_tag':'Picard-QualityScoreDistribution'})
  756. ## PIPELINE
  757. run_picard_qual_summary >> upload_picard_qual_summary_to_box
  758. ## TASK
  759. run_picard_rna_summary = \
  760. PythonOperator(
  761. task_id='run_picard_rna_summary',
  762. dag=dag,
  763. queue='hpc_8G',
  764. python_callable=run_picard_for_cellranger,
  765. params={'xcom_pull_files_key':'run_picard_rna_summary',
  766. 'xcom_pull_task':'copy_bam_for_parallel_runs',
  767. 'analysis_description_xcom_pull_task':'fetch_analysis_info',
  768. 'analysis_description_xcom_key':'analysis_description',
  769. 'use_ephemeral_space':True,
  770. 'load_metrics_to_cram':True,
  771. 'java_param':'-Xmx7g',
  772. 'picard_command':'CollectRnaSeqMetrics',
  773. 'picard_option':{},
  774. 'analysis_files_xcom_key':'picard_rna_summary',
  775. 'bam_files_xcom_key':None})
  776. ## PIPELINE
  777. copy_bam_for_parallel_runs >> run_picard_rna_summary
  778. ## TASK
  779. cleanup_picard_rna_summary_input = \
  780. PythonOperator(
  781. task_id='cleanup_picard_rna_summary_input',
  782. dag=dag,
  783. queue='hpc_4G',
  784. python_callable=clean_up_files,
  785. params={'xcom_pull_files_key':'run_picard_rna_summary',
  786. 'xcom_pull_task':'copy_bam_for_parallel_runs'})
  787. ## PIPELINE
  788. run_picard_rna_summary >> cleanup_picard_rna_summary_input
  789. ## TASK
  790. upload_picard_rna_summary_to_box = \
  791. PythonOperator(
  792. task_id='upload_picard_rna_summary_to_box',
  793. dag=dag,
  794. queue='hpc_4G',
  795. python_callable=upload_analysis_file_to_box,
  796. params={'xcom_pull_task':'run_picard_rna_summary',
  797. 'xcom_pull_files_key':'picard_rna_summary',
  798. 'analysis_tag':'Picard-CollectRnaSeqMetrics'})
  799. ## PIPELINE
  800. run_picard_rna_summary >> upload_picard_rna_summary_to_box
  801. ## TASK
  802. run_picard_gc_summary = \
  803. PythonOperator(
  804. task_id='run_picard_gc_summary',
  805. dag=dag,
  806. queue='hpc_4G',
  807. python_callable=run_picard_for_cellranger,
  808. params={'xcom_pull_files_key':'run_picard_gc_summary',
  809. 'xcom_pull_task':'copy_bam_for_parallel_runs',
  810. 'analysis_description_xcom_pull_task':'fetch_analysis_info',
  811. 'analysis_description_xcom_key':'analysis_description',
  812. 'use_ephemeral_space':True,
  813. 'load_metrics_to_cram':True,
  814. 'java_param':'-Xmx4g',
  815. 'picard_command':'CollectGcBiasMetrics',
  816. 'picard_option':{},
  817. 'analysis_files_xcom_key':'picard_gc_summary',
  818. 'bam_files_xcom_key':None})
  819. ## PIPELINE
  820. copy_bam_for_parallel_runs >> run_picard_gc_summary
  821. ## TASK
  822. cleanup_picard_gc_summary_input = \
  823. PythonOperator(
  824. task_id='cleanup_picard_gc_summary_input',
  825. dag=dag,
  826. queue='hpc_4G',
  827. python_callable=clean_up_files,
  828. params={'xcom_pull_files_key':'run_picard_gc_summary',
  829. 'xcom_pull_task':'copy_bam_for_parallel_runs'})
  830. ## PIPELINE
  831. run_picard_gc_summary >> cleanup_picard_gc_summary_input
  832. ## TASK
  833. upload_picard_gc_summary_to_box = \
  834. PythonOperator(
  835. task_id='upload_picard_gc_summary_to_box',
  836. dag=dag,
  837. queue='hpc_4G',
  838. python_callable=upload_analysis_file_to_box,
  839. params={'xcom_pull_task':'run_picard_gc_summary',
  840. 'xcom_pull_files_key':'picard_gc_summary',
  841. 'analysis_tag':'Picard-CollectGcBiasMetrics'})
  842. ## PIPELINE
  843. run_picard_gc_summary >> upload_picard_gc_summary_to_box
  844. ## TASK
  845. run_picard_base_dist_summary = \
  846. PythonOperator(
  847. task_id='run_picard_base_dist_summary',
  848. dag=dag,
  849. queue='hpc_4G',
  850. python_callable=run_picard_for_cellranger,
  851. params={'xcom_pull_files_key':'run_picard_base_dist_summary',
  852. 'xcom_pull_task':'copy_bam_for_parallel_runs',
  853. 'analysis_description_xcom_pull_task':'fetch_analysis_info',
  854. 'analysis_description_xcom_key':'analysis_description',
  855. 'use_ephemeral_space':True,
  856. 'load_metrics_to_cram':True,
  857. 'java_param':'-Xmx4g',
  858. 'picard_command':'CollectBaseDistributionByCycle',
  859. 'picard_option':{},
  860. 'analysis_files_xcom_key':'picard_base_summary',
  861. 'bam_files_xcom_key':None})
  862. ## PIPELINE
  863. copy_bam_for_parallel_runs >> run_picard_base_dist_summary
  864. ## TASK
  865. cleanup_picard_base_dist_summary_input = \
  866. PythonOperator(
  867. task_id='cleanup_picard_base_dist_summary_input',
  868. dag=dag,
  869. queue='hpc_4G',
  870. python_callable=clean_up_files,
  871. params={'xcom_pull_files_key':'run_picard_base_dist_summary',
  872. 'xcom_pull_task':'copy_bam_for_parallel_runs'})
  873. ## PIPELINE
  874. run_picard_base_dist_summary >> cleanup_picard_base_dist_summary_input
  875. ## TASK
  876. upload_picard_base_dist_summary_to_box = \
  877. PythonOperator(
  878. task_id='upload_picard_base_dist_summary_to_box',
  879. dag=dag,
  880. queue='hpc_4G',
  881. python_callable=upload_analysis_file_to_box,
  882. params={'xcom_pull_task':'run_picard_base_dist_summary',
  883. 'xcom_pull_files_key':'picard_base_summary',
  884. 'analysis_tag':'Picard-CollectBaseDistributionByCycle'})
  885. ## PIPELINE
  886. run_picard_base_dist_summary >> upload_picard_base_dist_summary_to_box
  887. ## TASK
  888. run_samtools_stats = \
  889. PythonOperator(
  890. task_id='run_samtools_stats',
  891. dag=dag,
  892. queue='hpc_4G4t',
  893. python_callable=run_samtools_for_cellranger,
  894. params={'xcom_pull_files_key':'run_samtools_stats',
  895. 'xcom_pull_task':'copy_bam_for_parallel_runs',
  896. 'analysis_description_xcom_pull_task':'fetch_analysis_info',
  897. 'analysis_description_xcom_key':'analysis_description',
  898. 'use_ephemeral_space':True,
  899. 'load_metrics_to_cram':True,
  900. 'samtools_command':'stats',
  901. 'threads':4,
  902. 'analysis_files_xcom_key':'samtools_stats'})
  903. ## PIPELINE
  904. copy_bam_for_parallel_runs >> run_samtools_stats
  905. ## TASK
  906. upload_samtools_stats_to_box = \
  907. PythonOperator(
  908. task_id='upload_samtools_stats_to_box',
  909. dag=dag,
  910. queue='hpc_4G',
  911. python_callable=upload_analysis_file_to_box,
  912. params={'xcom_pull_task':'run_samtools_stats',
  913. 'xcom_pull_files_key':'samtools_stats',
  914. 'analysis_tag':'Samtools-stats'})
  915. ## PIPELINE
  916. run_samtools_stats >> upload_samtools_stats_to_box
  917. ## TASK
  918. run_samtools_idxstats = \
  919. PythonOperator(
  920. task_id='run_samtools_idxstats',
  921. dag=dag,
  922. queue='hpc_4G4t',
  923. python_callable=run_samtools_for_cellranger,
  924. params={'xcom_pull_files_key':'run_samtools_stats',
  925. 'xcom_pull_task':'copy_bam_for_parallel_runs',
  926. 'analysis_description_xcom_pull_task':'fetch_analysis_info',
  927. 'analysis_description_xcom_key':'analysis_description',
  928. 'use_ephemeral_space':True,
  929. 'load_metrics_to_cram':True,
  930. 'samtools_command':'idxstats',
  931. 'threads':4,
  932. 'analysis_files_xcom_key':'samtools_idxstats'})
  933. ## PIPELINE
  934. run_samtools_stats >> run_samtools_idxstats
  935. ## TASK
  936. cleanup_samtools_stats_input = \
  937. PythonOperator(
  938. task_id='cleanup_samtools_stats_input',
  939. dag=dag,
  940. queue='hpc_4G',
  941. python_callable=clean_up_files,
  942. params={'xcom_pull_files_key':'run_samtools_stats',
  943. 'xcom_pull_task':'copy_bam_for_parallel_runs'})
  944. ## PIPELINE
  945. run_samtools_idxstats >> cleanup_samtools_stats_input
  946. ## TASK
  947. upload_samtools_idxstats_to_box = \
  948. PythonOperator(
  949. task_id='upload_samtools_idxstats_to_box',
  950. dag=dag,
  951. queue='hpc_4G',
  952. python_callable=upload_analysis_file_to_box,
  953. params={'xcom_pull_task':'run_samtools_idxstats',
  954. 'xcom_pull_files_key':'samtools_idxstats',
  955. 'analysis_tag':'Samtools-idxstats'})
  956. ## PIPELINE
  957. run_samtools_idxstats >> upload_samtools_idxstats_to_box
  958. ## TASK
  959. run_multiqc = \
  960. PythonOperator(
  961. task_id='run_multiqc',
  962. dag=dag,
  963. queue='hpc_4G',
  964. trigger_rule='none_failed_or_skipped',
  965. python_callable=run_multiqc_for_cellranger,
  966. params={
  967. 'list_of_analysis_xcoms_and_tasks':{
  968. 'run_cellranger':'cellranger_output',
  969. 'run_picard_alignment_summary':'picard_alignment_summary',
  970. 'run_picard_qual_summary':'picard_qual_summary',
  971. 'run_picard_rna_summary':'picard_rna_summary',
  972. 'run_picard_gc_summary':'picard_gc_summary',
  973. 'run_picard_base_dist_summary':'picard_base_summary',
  974. 'run_samtools_stats':'samtools_stats',
  975. 'run_samtools_idxstats':'samtools_idxstats'},
  976. 'analysis_description_xcom_pull_task':'fetch_analysis_info',
  977. 'analysis_description_xcom_key':'analysis_description',
  978. 'use_ephemeral_space':True,
  979. 'multiqc_html_file_xcom_key':'multiqc_html',
  980. 'multiqc_data_file_xcom_key':'multiqc_data',
  981. 'tool_order_list':['picad','samtools']})
  982. ## PIPELINE
  983. run_picard_alignment_summary >> run_multiqc
  984. run_picard_qual_summary >> run_multiqc
  985. run_picard_rna_summary >> run_multiqc
  986. run_picard_gc_summary >> run_multiqc
  987. run_picard_base_dist_summary >> run_multiqc
  988. run_samtools_idxstats >> run_multiqc
  989. ## TASK
  990. load_multiqc_html = \
  991. PythonOperator(
  992. task_id='load_multiqc_html',
  993. dag=dag,
  994. queue='hpc_4G',
  995. python_callable=load_analysis_files_func,
  996. params={'collection_name_task':'load_cellranger_result_to_db',
  997. 'collection_name_key':'sample_igf_id',
  998. 'file_name_task':'run_multiqc',
  999. 'file_name_key':'multiqc_html',
  1000. 'analysis_name':'multiqc',
  1001. 'collection_type':'MULTIQC_HTML',
  1002. 'collection_table':'sample',
  1003. 'output_files_key':'output_db_files'})
  1004. ## PIPELINE
  1005. run_multiqc >> load_multiqc_html
  1006. ## TASK
  1007. upload_multiqc_to_ftp = \
  1008. PythonOperator(
  1009. task_id='upload_multiqc_to_ftp',
  1010. dag=dag,
  1011. queue='hpc_4G',
  1012. python_callable=ftp_files_upload_for_analysis,
  1013. params={'xcom_pull_task':'load_multiqc_html',
  1014. 'xcom_pull_files_key':'output_db_files',
  1015. 'collection_name_task':'load_cellranger_result_to_db',
  1016. 'collection_name_key':'sample_igf_id',
  1017. 'collection_type':'FTP_MULTIQC_HTML',
  1018. 'collection_table':'sample',
  1019. 'collect_remote_file':True})
  1020. ## PIPELINE
  1021. load_multiqc_html >> upload_multiqc_to_ftp
  1022. ## TASK
  1023. upload_multiqc_to_box = \
  1024. PythonOperator(
  1025. task_id='upload_multiqc_to_box',
  1026. dag=dag,
  1027. queue='hpc_4G',
  1028. python_callable=upload_analysis_file_to_box,
  1029. params={'xcom_pull_task':'load_multiqc_html',
  1030. 'xcom_pull_files_key':'output_db_files',
  1031. 'analysis_tag':'multiqc_report'})
  1032. ## PIPELINE
  1033. load_multiqc_html >> upload_multiqc_to_box
  1034. ## TASK
  1035. update_analysis_and_status = \
  1036. PythonOperator(
  1037. task_id='update_analysis_and_status',
  1038. dag=dag,
  1039. queue='hpc_4G',
  1040. python_callable=change_pipeline_status,
  1041. trigger_rule='none_failed_or_skipped',
  1042. params={'new_status':'FINISHED',
  1043. 'no_change_status':'SEEDED'})
  1044. ## PIPELINE
  1045. upload_multiqc_to_ftp >> update_analysis_and_status
  1046. upload_scanpy_report_for_sc_5p_to_ftp >> update_analysis_and_status
  1047. upload_scanpy_report_for_sc_5p_to_box >> update_analysis_and_status
  1048. upload_cellbrowser_for_sc_5p_to_ftp >> update_analysis_and_status
  1049. #upload_scirpy_report_for_vdj_to_ftp >> update_analysis_and_status # no more ambiguous VDJ
  1050. #upload_scirpy_report_for_vdj_to_box >> update_analysis_and_status
  1051. upload_scirpy_report_for_vdj_b_to_ftp >> update_analysis_and_status
  1052. upload_scirpy_report_for_vdj_b_to_box >> update_analysis_and_status
  1053. upload_scirpy_report_for_vdj_t_to_ftp >> update_analysis_and_status
  1054. upload_scirpy_report_for_vdj_t_to_box >> update_analysis_and_status
  1055. upload_seurat_report_for_sc_5p_ftp >> update_analysis_and_status
  1056. upload_seurat_report_for_sc_5p_to_box >> update_analysis_and_status
  1057. upload_cellranger_results_to_irods >> update_analysis_and_status
  1058. upload_cellranger_report_to_ftp >> update_analysis_and_status
  1059. upload_cellranger_report_to_box >> update_analysis_and_status
  1060. upload_cram_to_irods >> update_analysis_and_status
  1061. upload_scvelo_report_to_box >> update_analysis_and_status
  1062. upload_scvelo_report_to_ftp >> update_analysis_and_status
  1063. ## TASK
  1064. update_qc_pages = \
  1065. PythonOperator(
  1066. task_id='update_qc_pages',
  1067. dag=dag,
  1068. queue='hpc_4G',
  1069. python_callable=create_and_update_qc_pages,
  1070. params={'use_ephemeral_space':True,
  1071. 'collection_type_list':[
  1072. 'FTP_MULTIQC_HTML',
  1073. 'FTP_SEURAT_HTML',
  1074. 'FTP_SCIRPY_VDJ_T_HTML',
  1075. 'FTP_SCIRPY_VDJ_B_HTML',
  1076. 'FTP_SCIRPY_VDJ_HTML',
  1077. 'FTP_CELLBROWSER',
  1078. 'FTP_SCANPY_HTML',
  1079. 'FTP_SCVELO_HTML',
  1080. 'FTP_CELLRANGER_HTML']})
  1081. ## PIPELINE
  1082. update_analysis_and_status >> update_qc_pages