dag9_tenx_single_cell_immune_profiling.py 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808
  1. from datetime import timedelta
  2. import os,json,logging,subprocess
  3. from airflow.models import DAG,Variable
  4. from airflow.utils.dates import days_ago
  5. from airflow.operators.python_operator import PythonOperator
  6. from airflow.operators.python_operator import BranchPythonOperator
  7. from airflow.operators.dummy_operator import DummyOperator
  8. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import fetch_analysis_info_and_branch_func
  9. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import configure_cellranger_run_func
  10. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_sc_read_trimmming_func
  11. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_cellranger_tool
  12. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import decide_analysis_branch_func
  13. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_cellranger_result_to_db_func
  14. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import ftp_files_upload_for_analysis
  15. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import irods_files_upload_for_analysis
  16. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_scanpy_for_sc_5p_func
  17. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_singlecell_notebook_wrapper_func
  18. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_analysis_files_func
  19. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import task_branch_function
  20. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import upload_analysis_file_to_box
  21. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import convert_bam_to_cram_func
  22. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_picard_for_cellranger
  23. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_samtools_for_cellranger
  24. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_multiqc_for_cellranger
  25. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import index_and_copy_bam_for_parallel_analysis
  ## ARGS
  # Default keyword arguments that Airflow applies to every operator
  # created for this DAG (passed to DAG(...) as ``default_args``).
  # NOTE(review): 'max_active_runs' and 'catchup' are DAG-level settings in
  # Airflow; confirm they actually take effect when supplied through here.
  default_args = {
    # ownership / scheduling
    'owner': 'airflow',
    'start_date': days_ago(2),
    'depends_on_past': False,
    'catchup': False,
    'max_active_runs': 10,
    # retry policy: up to 4 retries, 5 minutes apart
    'retries': 4,
    'retry_delay': timedelta(minutes=5),
    # notifications are switched off
    'email_on_failure': False,
    'email_on_retry': False,
    # hand the task context to the python callables as keyword arguments
    'provide_context': True,
  }
  # Comma-separated feature types read from an Airflow Variable; each entry
  # becomes the task_id of one branch task further down. Entries are
  # stripped and empty ones dropped so that a value such as "a, b," cannot
  # produce task ids with spaces or an empty task id.
  FEATURE_TYPE_LIST = [
    feature_type.strip()
    for feature_type in
    Variable.get('tenx_single_cell_immune_profiling_feature_types').split(',')
    if feature_type.strip()
  ]
  ## DAG
  # Manually triggered DAG (no schedule). ``max_active_runs`` and ``catchup``
  # are arguments of the DAG constructor; entries in ``default_args`` are only
  # forwarded to operators, so the values intended there are passed here
  # explicitly as well.
  dag = DAG(
    dag_id='dag9_tenx_single_cell_immune_profiling',
    schedule_interval=None,
    max_active_runs=10,
    catchup=False,
    tags=['hpc', 'analysis', 'tenx', 'sc'],
    default_args=default_args,
    orientation='LR',
  )
  48. with dag:
  ## TASK
  # First task of the DAG: a branching task driven by
  # fetch_analysis_info_and_branch_func. Its params name the 'no_analysis'
  # fallback task and the two XCom keys that downstream tasks pull from
  # 'fetch_analysis_info'.
  fetch_analysis_info_and_branch = BranchPythonOperator(
    task_id='fetch_analysis_info',
    dag=dag,
    queue='hpc_4G',
    python_callable=fetch_analysis_info_and_branch_func,
    params={
      'no_analysis_task': 'no_analysis',
      'analysis_description_xcom_key': 'analysis_description',
      'analysis_info_xcom_key': 'analysis_info',
    },
  )
  ## TASK
  # Join point after the per-feature-type trimming branches. The
  # 'none_failed_or_skipped' trigger rule is needed because some of the
  # upstream branches are skipped by the branching tasks. The callable reads
  # the analysis XComs from 'fetch_analysis_info'; 'library_csv_xcom_key' is
  # the key that run_cellranger later pulls from this task.
  configure_cellranger_run = PythonOperator(
    task_id='configure_cellranger_run',
    dag=dag,
    queue='hpc_4G',
    trigger_rule='none_failed_or_skipped',
    python_callable=configure_cellranger_run_func,
    params={
      'xcom_pull_task_id': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'analysis_info_xcom_key': 'analysis_info',
      'library_csv_xcom_key': 'cellranger_library_csv',
    },
  )
  # One sub-graph per configured feature type:
  #   fetch_analysis_info -> <feature type> -> run_trim_<type>_0..9
  #                       -> collect_trimmed_files_<type> -> configure_cellranger_run
  for analysis_name in FEATURE_TYPE_LIST:
    ## TASK
    # Branch task named after the feature type; task_branch_function is
    # given the 'run_trim' prefix used by the trimming task ids below.
    task_branch = BranchPythonOperator(
      task_id=analysis_name,
      dag=dag,
      queue='hpc_4G',
      python_callable=task_branch_function,
      params={
        'xcom_pull_task_id': 'fetch_analysis_info',
        'analysis_info_xcom_key': 'analysis_info',
        'analysis_name': analysis_name,
        'task_prefix': 'run_trim',
      },
    )
    ## TASK
    # Ten pre-declared trimming tasks (run_id 0-9) for this feature type.
    run_trim_list = [
      PythonOperator(
        task_id='run_trim_{0}_{1}'.format(analysis_name, run_id),
        dag=dag,
        queue='hpc_4G',
        python_callable=run_sc_read_trimmming_func,
        params={
          'xcom_pull_task_id': 'fetch_analysis_info',
          'analysis_info_xcom_key': 'analysis_info',
          'analysis_description_xcom_key': 'analysis_description',
          'analysis_name': analysis_name,
          'run_id': run_id,
          'r1_length': 0,
          'r2_length': 0,
          'fastq_input_dir_tag': 'fastq_dir',
          'fastq_output_dir_tag': 'output_path',
        },
      )
      for run_id in range(10)
    ]
    ## TASK
    # Placeholder join for this feature type's trimming tasks.
    collect_trimmed_files = DummyOperator(
      task_id='collect_trimmed_files_{0}'.format(analysis_name),
      dag=dag,
      trigger_rule='none_failed_or_skipped',
    )
    ## PIPELINE
    fetch_analysis_info_and_branch >> task_branch >> run_trim_list
    run_trim_list >> collect_trimmed_files >> configure_cellranger_run
  ## TASK
  # Do-nothing end point; its task id is the value passed to
  # fetch_analysis_info as 'no_analysis_task'.
  no_analysis = DummyOperator(task_id='no_analysis', dag=dag)
  ## PIPELINE
  no_analysis << fetch_analysis_info_and_branch
  ## TASK
  # Runs the cellranger step via run_cellranger_tool on the large
  # 'hpc_64G16t24hr' queue. It is configured to pull the analysis description
  # from 'fetch_analysis_info' and the library csv from
  # 'configure_cellranger_run'; 'cellranger_xcom_key' is the key later tasks
  # pull from 'run_cellranger'.
  run_cellranger = PythonOperator(
    task_id='run_cellranger',
    dag=dag,
    queue='hpc_64G16t24hr',
    python_callable=run_cellranger_tool,
    params={
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'library_csv_xcom_key': 'cellranger_library_csv',
      'library_csv_xcom_pull_task': 'configure_cellranger_run',
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_options': ['--localcores 16', '--localmem 64'],
    },
  )
  ## PIPELINE
  run_cellranger << configure_cellranger_run
  ## TASK
  # Branching task after cellranger: decide_analysis_branch_func receives the
  # candidate downstream task ids through params, plus the location of the
  # library csv XCom.
  # NOTE(review): 'convert_bam_to_cram' and 'run_picard_alignment_summary' are
  # not task ids directly downstream of this task (the direct downstream is
  # 'copy_bam_for_parallel_runs', and the CRAM task id is
  # 'convert_cellranger_bam_to_cram') - confirm how the callable uses them.
  decide_analysis_branch = BranchPythonOperator(
    task_id='decide_analysis_branch',
    dag=dag,
    queue='hpc_4G',
    python_callable=decide_analysis_branch_func,
    params={
      'load_cellranger_result_to_db_task': 'load_cellranger_result_to_db',
      'run_scanpy_for_sc_5p_task': 'run_scanpy_for_sc_5p',
      'run_scirpy_for_vdj_task': 'run_scirpy_for_vdj',
      'run_scirpy_for_vdj_b_task': 'run_scirpy_for_vdj_b',
      'run_scirpy_vdj_t_task': 'run_scirpy_for_vdj_t',
      'run_seurat_for_sc_5p_task': 'run_seurat_for_sc_5p',
      'run_picard_alignment_summary_task': 'run_picard_alignment_summary',
      'convert_bam_to_cram_task': 'convert_bam_to_cram',
      'library_csv_xcom_key': 'cellranger_library_csv',
      'library_csv_xcom_pull_task': 'configure_cellranger_run',
    },
  )
  ## PIPELINE
  decide_analysis_branch << run_cellranger
  ## TASK
  # Registers the cellranger output via load_cellranger_result_to_db_func.
  # 'output_xcom_key' and 'html_xcom_key' are the keys the three upload tasks
  # below pull from this task.
  load_cellranger_result_to_db = PythonOperator(
    task_id='load_cellranger_result_to_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_cellranger_result_to_db_func,
    params={
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'collection_type': 'CELLRANGER_MULTI',
      'collection_table': 'sample',
      'xcom_collection_name_key': 'sample_igf_id',
      'genome_column': 'genome_build',
      'analysis_name': 'cellranger_multi',
      'output_xcom_key': 'loaded_output_files',
      'html_xcom_key': 'html_report_file',
      'html_report_file_name': 'web_summary.html',
    },
  )
  # FTP upload of the html report registered above.
  upload_cellranger_report_to_ftp = PythonOperator(
    task_id='upload_cellranger_report_to_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'html_report_file',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_CELLRANGER_MULTI',
      'collection_table': 'sample',
      'collect_remote_file': True,
    },
  )
  # Box upload of the same html report. The callable used to be None, which
  # PythonOperator rejects when the operator is constructed (the whole DAG
  # then fails to load); it now uses the same uploader as the other
  # *_to_box tasks in this file.
  # NOTE(review): 'analysis_tag' was added to mirror the sibling box uploads;
  # its value is a guess - confirm the tag wanted for cellranger reports.
  upload_cellranger_report_to_box = PythonOperator(
    task_id='upload_cellranger_report_to_box',
    dag=dag,
    queue='hpc_4G',
    python_callable=upload_analysis_file_to_box,
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'html_report_file',
      'analysis_tag': 'cellranger_multi_single_sample_report',
    },
  )
  # iRODS upload of the loaded output files.
  upload_cellranger_results_to_irods = PythonOperator(
    task_id='upload_cellranger_results_to_irods',
    dag=dag,
    queue='hpc_4G',
    python_callable=irods_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'loaded_output_files',
      'collection_name_key': 'sample_igf_id',
      'analysis_name': 'cellranger_multi',
    },
  )
  ## PIPELINE
  decide_analysis_branch >> load_cellranger_result_to_db
  load_cellranger_result_to_db >> [
    upload_cellranger_report_to_ftp,
    upload_cellranger_report_to_box,
    upload_cellranger_results_to_irods,
  ]
  ## TASK
  # Scanpy notebook run on the cellranger output (python3 kernel, analysis
  # name 'scanpy'). 'output_notebook_key' and 'output_cellbrowser_key' are the
  # XCom keys pulled from this task by the load / cellbrowser upload tasks
  # below.
  run_scanpy_for_sc_5p = PythonOperator(
    task_id='run_scanpy_for_sc_5p',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_singlecell_notebook_wrapper_func,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'count_dir': 'count',
      'analysis_name': 'scanpy',
      'output_notebook_key': 'scanpy_notebook',
      'output_cellbrowser_key': 'cellbrowser_dirs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    },
  )
  # Registers the scanpy notebook; the sample id comes from
  # 'load_cellranger_result_to_db'.
  load_scanpy_report_for_sc_5p_to_db = PythonOperator(
    task_id='load_scanpy_report_for_sc_5p_to_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_analysis_files_func,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scanpy_for_sc_5p',
      'file_name_key': 'scanpy_notebook',
      'analysis_name': 'scanpy_5p',
      'collection_type': 'SCANPY_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    },
  )
  # FTP upload of the registered scanpy report.
  upload_scanpy_report_for_sc_5p_to_ftp = PythonOperator(
    task_id='upload_scanpy_report_for_sc_5p_to_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_scanpy_report_for_sc_5p_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCANPY_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    },
  )
  # Box upload of the registered scanpy report.
  upload_scanpy_report_for_sc_5p_to_box = PythonOperator(
    task_id='upload_scanpy_report_for_sc_5p_to_box',
    dag=dag,
    queue='hpc_4G',
    python_callable=upload_analysis_file_to_box,
    params={
      'xcom_pull_task': 'load_scanpy_report_for_sc_5p_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scanpy_single_sample_report',
    },
  )
  # FTP upload of the cellbrowser output, pulled straight from the scanpy run.
  upload_cellbrowser_for_sc_5p_to_ftp = PythonOperator(
    task_id='upload_cellbrowser_for_sc_5p_to_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'run_scanpy_for_sc_5p',
      'xcom_pull_files_key': 'cellbrowser_dirs',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_CELLBROWSER',
      'collection_table': 'sample',
      'collect_remote_file': True,
    },
  )
  ## PIPELINE
  decide_analysis_branch >> run_scanpy_for_sc_5p
  run_scanpy_for_sc_5p >> [
    load_scanpy_report_for_sc_5p_to_db,
    upload_cellbrowser_for_sc_5p_to_ftp,
  ]
  load_scanpy_report_for_sc_5p_to_db >> [
    upload_scanpy_report_for_sc_5p_to_ftp,
    upload_scanpy_report_for_sc_5p_to_box,
  ]
  ## TASK
  # Scirpy notebook run for the 'vdj' directory of the cellranger output.
  # 'output_notebook_key' is the XCom key the load task below pulls.
  run_scirpy_for_vdj = PythonOperator(
    task_id='run_scirpy_for_vdj',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_singlecell_notebook_wrapper_func,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'analysis_name': 'scirpy',
      'vdj_dir': 'vdj',
      'count_dir': 'count',
      'output_notebook_key': 'scirpy_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    },
  )
  # Registers the scirpy notebook; publishes the result under
  # 'output_db_files' for the two upload tasks.
  load_scirpy_report_for_vdj_to_db = PythonOperator(
    task_id='load_scirpy_report_for_vdj_to_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_analysis_files_func,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scirpy_for_vdj',
      'file_name_key': 'scirpy_notebook',
      'analysis_name': 'scirpy_vdj',
      'collection_type': 'SCIRPY_VDJ_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    },
  )
  # Both uploads must pull from the load task defined just above; the
  # previous value ('load_scanpy_report_for_vdj_to_db') is not a task id in
  # this DAG, so the XCom pull could never find the files.
  upload_scirpy_report_for_vdj_to_ftp = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_to_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCIRPY_VDJ_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    },
  )
  upload_scirpy_report_for_vdj_to_box = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_to_box',
    dag=dag,
    queue='hpc_4G',
    python_callable=upload_analysis_file_to_box,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scirpy_vdj_single_sample_report',
    },
  )
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj >> load_scirpy_report_for_vdj_to_db
  load_scirpy_report_for_vdj_to_db >> [
    upload_scirpy_report_for_vdj_to_ftp,
    upload_scirpy_report_for_vdj_to_box,
  ]
  ## TASK
  # Scirpy notebook run for the 'vdj_b' directory of the cellranger output.
  # 'output_notebook_key' is the XCom key the load task below pulls.
  run_scirpy_for_vdj_b = PythonOperator(
    task_id='run_scirpy_for_vdj_b',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_singlecell_notebook_wrapper_func,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'analysis_name': 'scirpy',
      'vdj_dir': 'vdj_b',
      'count_dir': 'count',
      'output_notebook_key': 'scirpy_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    },
  )
  # Registers the notebook; publishes the result under 'output_db_files'.
  load_scirpy_report_for_vdj_b_to_db = PythonOperator(
    task_id='load_scirpy_report_for_vdj_b_to_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_analysis_files_func,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scirpy_for_vdj_b',
      'file_name_key': 'scirpy_notebook',
      'analysis_name': 'scirpy_vdj_b',
      'collection_type': 'SCIRPY_VDJ_B_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    },
  )
  # Both uploads must pull from the load task defined just above; the
  # previous value ('load_scanpy_report_for_vdj_b_to_db') is not a task id in
  # this DAG, so the XCom pull could never find the files.
  upload_scirpy_report_for_vdj_b_to_ftp = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_b_to_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_b_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCIRPY_VDJ_B_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    },
  )
  # NOTE(review): this task_id says 'scanpy' while the vdj / vdj_t siblings
  # say 'scirpy'. It is kept unchanged because renaming a task changes its
  # identity in Airflow's run history - rename deliberately if wanted.
  upload_scirpy_report_for_vdj_b_to_box = PythonOperator(
    task_id='upload_scanpy_report_for_vdj_b_to_box',
    dag=dag,
    queue='hpc_4G',
    python_callable=upload_analysis_file_to_box,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_b_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scirpy_vdj_b_single_sample_report',
    },
  )
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj_b >> load_scirpy_report_for_vdj_b_to_db
  load_scirpy_report_for_vdj_b_to_db >> [
    upload_scirpy_report_for_vdj_b_to_ftp,
    upload_scirpy_report_for_vdj_b_to_box,
  ]
  ## TASK
  # Scirpy notebook run for the 'vdj_t' directory of the cellranger output.
  # 'output_notebook_key' is the XCom key the load task below pulls.
  run_scirpy_for_vdj_t = PythonOperator(
    task_id='run_scirpy_for_vdj_t',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_singlecell_notebook_wrapper_func,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'analysis_name': 'scirpy',
      'vdj_dir': 'vdj_t',
      'count_dir': 'count',
      'output_notebook_key': 'scirpy_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    },
  )
  # Registers the notebook; publishes the result under 'output_db_files'.
  load_scirpy_report_for_vdj_t_to_db = PythonOperator(
    task_id='load_scirpy_report_for_vdj_t_to_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_analysis_files_func,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scirpy_for_vdj_t',
      'file_name_key': 'scirpy_notebook',
      'analysis_name': 'scirpy_vdj_t',
      'collection_type': 'SCIRPY_VDJ_T_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    },
  )
  # Both uploads must pull from the load task defined just above; the
  # previous value ('load_scanpy_report_for_vdj_t_to_db') is not a task id in
  # this DAG, so the XCom pull could never find the files.
  upload_scirpy_report_for_vdj_t_to_ftp = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_t_to_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_t_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCIRPY_VDJ_T_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    },
  )
  upload_scirpy_report_for_vdj_t_to_box = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_t_to_box',
    dag=dag,
    queue='hpc_4G',
    python_callable=upload_analysis_file_to_box,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_t_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scirpy_vdj_t_single_sample_report',
    },
  )
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj_t >> load_scirpy_report_for_vdj_t_to_db
  load_scirpy_report_for_vdj_t_to_db >> [
    upload_scirpy_report_for_vdj_t_to_ftp,
    upload_scirpy_report_for_vdj_t_to_box,
  ]
  ## TASK
  # Seurat notebook run on the cellranger output; same wrapper callable as
  # the scanpy / scirpy tasks but with the 'R' kernel. 'output_notebook_key'
  # is the XCom key the load task below pulls.
  run_seurat_for_sc_5p = PythonOperator(
    task_id='run_seurat_for_sc_5p',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_singlecell_notebook_wrapper_func,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'R',
      'analysis_name': 'seurat',
      'vdj_dir': 'vdj',
      'count_dir': 'count',
      'output_notebook_key': 'seurat_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    },
  )
  # Registers the seurat notebook; publishes the result under
  # 'output_db_files' for the two upload tasks.
  load_seurat_report_for_sc_5p_db = PythonOperator(
    task_id='load_seurat_report_for_sc_5p_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_analysis_files_func,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_seurat_for_sc_5p',
      'file_name_key': 'seurat_notebook',
      'analysis_name': 'seurat_5p',
      'collection_type': 'SEURAT_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    },
  )
  # FTP upload of the registered seurat report.
  upload_seurat_report_for_sc_5p_ftp = PythonOperator(
    task_id='upload_seurat_report_for_sc_5p_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_seurat_report_for_sc_5p_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SEURAT_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    },
  )
  # Box upload of the registered seurat report.
  upload_seurat_report_for_sc_5p_to_box = PythonOperator(
    task_id='upload_seurat_report_for_sc_5p_to_box',
    dag=dag,
    queue='hpc_4G',
    python_callable=upload_analysis_file_to_box,
    params={
      'xcom_pull_task': 'load_seurat_report_for_sc_5p_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'seurat_single_sample_report',
    },
  )
  ## PIPELINE
  decide_analysis_branch >> run_seurat_for_sc_5p >> load_seurat_report_for_sc_5p_db
  load_seurat_report_for_sc_5p_db >> [
    upload_seurat_report_for_sc_5p_ftp,
    upload_seurat_report_for_sc_5p_to_box,
  ]
  ## TASK
  # Branching task in front of the BAM-based QC / conversion tasks. It is
  # configured to read 'cellranger_output' from 'run_cellranger' and is given
  # the ids of the tasks that consume the BAM in parallel.
  # presumably index_and_copy_bam_for_parallel_analysis prepares one BAM copy
  # per listed task - verify against the helper.
  copy_bam_for_parallel_runs = BranchPythonOperator(
    task_id='copy_bam_for_parallel_runs',
    dag=dag,
    queue='hpc_4G',
    python_callable=index_and_copy_bam_for_parallel_analysis,
    params={
      'xcom_pull_files_key': 'cellranger_output',
      'xcom_pull_task': 'run_cellranger',
      'list_of_tasks': [
        'convert_cellranger_bam_to_cram',
        'run_picard_alignment_summary',
        'run_picard_qual_summary',
        'run_picard_rna_summary',
        'run_picard_gc_summary',
        'run_picard_base_dist_summary',
        'run_samtools_stats',
      ],
    },
  )
  ## TASK
  # BAM to CRAM conversion (4 threads, 'hpc_4G4t' queue). The input is pulled
  # from 'copy_bam_for_parallel_runs' under a key equal to this task's own id;
  # the output is published under 'cram_files'.
  convert_bam_to_cram = PythonOperator(
    task_id='convert_cellranger_bam_to_cram',
    dag=dag,
    queue='hpc_4G4t',
    python_callable=convert_bam_to_cram_func,
    params={
      'xcom_pull_files_key': 'convert_cellranger_bam_to_cram',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'threads': 4,
      'analysis_name': 'cellranger',
      'collection_type': 'ALIGNED_CRAM',
      'collection_table': 'sample',
      'cram_files_xcom_key': 'cram_files',
    },
  )
  # iRODS upload of the CRAM files produced above.
  upload_cram_to_irods = PythonOperator(
    task_id='upload_cram_to_irods',
    dag=dag,
    queue='hpc_4G',
    python_callable=irods_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'convert_cellranger_bam_to_cram',
      'xcom_pull_files_key': 'cram_files',
      'collection_name_key': 'sample_igf_id',
      'analysis_name': 'cellranger_multi',
    },
  )
  ## PIPELINE
  decide_analysis_branch >> copy_bam_for_parallel_runs
  copy_bam_for_parallel_runs >> convert_bam_to_cram >> upload_cram_to_irods
  ## TASK
  # Picard CollectAlignmentSummaryMetrics; the result is published under
  # 'picard_alignment_summary', which run_multiqc pulls.
  # NOTE(review): this pulls key 'cellranger_output' from
  # 'copy_bam_for_parallel_runs', whereas run_picard_base_dist_summary,
  # run_samtools_stats and convert_cellranger_bam_to_cram pull a key named
  # after their own task id from that task - confirm which key it pushes.
  run_picard_alignment_summary = PythonOperator(
    task_id='run_picard_alignment_summary',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_picard_for_cellranger,
    params={
      'xcom_pull_files_key': 'cellranger_output',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'java_param': '-Xmx4g',
      'picard_command': 'CollectAlignmentSummaryMetrics',
      'picard_option': {},
      'analysis_files_xcom_key': 'picard_alignment_summary',
      'bam_files_xcom_key': None,
    },
  )
  ## PIPELINE
  run_picard_alignment_summary << copy_bam_for_parallel_runs
  ## TASK
  # Picard QualityScoreDistribution; the result is published under
  # 'picard_qual_summary', which run_multiqc pulls.
  # NOTE(review): this pulls key 'cellranger_output' from
  # 'copy_bam_for_parallel_runs', whereas run_picard_base_dist_summary,
  # run_samtools_stats and convert_cellranger_bam_to_cram pull a key named
  # after their own task id from that task - confirm which key it pushes.
  run_picard_qual_summary = PythonOperator(
    task_id='run_picard_qual_summary',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_picard_for_cellranger,
    params={
      'xcom_pull_files_key': 'cellranger_output',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'java_param': '-Xmx4g',
      'picard_command': 'QualityScoreDistribution',
      'picard_option': {},
      'analysis_files_xcom_key': 'picard_qual_summary',
      'bam_files_xcom_key': None,
    },
  )
  ## PIPELINE
  run_picard_qual_summary << copy_bam_for_parallel_runs
  ## TASK
  # Picard CollectRnaSeqMetrics; the result is published under
  # 'picard_rna_summary', which run_multiqc pulls.
  # NOTE(review): this pulls key 'cellranger_output' from
  # 'copy_bam_for_parallel_runs', whereas run_picard_base_dist_summary,
  # run_samtools_stats and convert_cellranger_bam_to_cram pull a key named
  # after their own task id from that task - confirm which key it pushes.
  run_picard_rna_summary = PythonOperator(
    task_id='run_picard_rna_summary',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_picard_for_cellranger,
    params={
      'xcom_pull_files_key': 'cellranger_output',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'java_param': '-Xmx4g',
      'picard_command': 'CollectRnaSeqMetrics',
      'picard_option': {},
      'analysis_files_xcom_key': 'picard_rna_summary',
      'bam_files_xcom_key': None,
    },
  )
  ## PIPELINE
  run_picard_rna_summary << copy_bam_for_parallel_runs
  ## TASK
  # Picard CollectGcBiasMetrics; the result is published under
  # 'picard_gc_summary', which run_multiqc pulls.
  # NOTE(review): this pulls key 'cellranger_output' from
  # 'copy_bam_for_parallel_runs', whereas run_picard_base_dist_summary,
  # run_samtools_stats and convert_cellranger_bam_to_cram pull a key named
  # after their own task id from that task - confirm which key it pushes.
  run_picard_gc_summary = PythonOperator(
    task_id='run_picard_gc_summary',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_picard_for_cellranger,
    params={
      'xcom_pull_files_key': 'cellranger_output',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'java_param': '-Xmx4g',
      'picard_command': 'CollectGcBiasMetrics',
      'picard_option': {},
      'analysis_files_xcom_key': 'picard_gc_summary',
      'bam_files_xcom_key': None,
    },
  )
  ## PIPELINE
  run_picard_gc_summary << copy_bam_for_parallel_runs
  ## TASK
  # Picard CollectBaseDistributionByCycle. The input is pulled from
  # 'copy_bam_for_parallel_runs' under a key equal to this task's own id; the
  # result is published under 'picard_base_summary', which run_multiqc pulls.
  run_picard_base_dist_summary = PythonOperator(
    task_id='run_picard_base_dist_summary',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_picard_for_cellranger,
    params={
      'xcom_pull_files_key': 'run_picard_base_dist_summary',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'java_param': '-Xmx4g',
      'picard_command': 'CollectBaseDistributionByCycle',
      'picard_option': {},
      'analysis_files_xcom_key': 'picard_base_summary',
      'bam_files_xcom_key': None,
    },
  )
  ## PIPELINE
  run_picard_base_dist_summary << copy_bam_for_parallel_runs
  ## TASK
  # samtools 'stats' with 4 threads. The input is pulled from
  # 'copy_bam_for_parallel_runs' under a key equal to this task's own id; the
  # result is published under 'samtools_stats', which run_multiqc pulls.
  run_samtools_stats = PythonOperator(
    task_id='run_samtools_stats',
    dag=dag,
    queue='hpc_4G4t',
    python_callable=run_samtools_for_cellranger,
    params={
      'xcom_pull_files_key': 'run_samtools_stats',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'samtools_command': 'stats',
      'threads': 4,
      'analysis_files_xcom_key': 'samtools_stats',
    },
  )
  ## PIPELINE
  run_samtools_stats << copy_bam_for_parallel_runs
  ## TASK
  # samtools 'idxstats'. It is chained after run_samtools_stats and pulls the
  # same 'run_samtools_stats' key from 'copy_bam_for_parallel_runs' (this task
  # is not in that task's 'list_of_tasks'). The result is published under
  # 'samtools_idxstats', which run_multiqc pulls.
  run_samtools_idxstats = PythonOperator(
    task_id='run_samtools_idxstats',
    dag=dag,
    queue='hpc_4G4t',
    python_callable=run_samtools_for_cellranger,
    params={
      'xcom_pull_files_key': 'run_samtools_stats',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'samtools_command': 'idxstats',
      'threads': 4,
      'analysis_files_xcom_key': 'samtools_idxstats',
    },
  )
  ## PIPELINE
  run_samtools_idxstats << run_samtools_stats
  ## TASK
  # Calls run_multiqc_for_cellranger on the 'hpc_4G' queue.
  # The 'none_failed_or_skipped' trigger rule is used instead of the default,
  # so the task is not blocked merely because an upstream task was skipped
  # (see the Airflow trigger-rule documentation for the exact semantics).
  run_multiqc = \
    PythonOperator(
      task_id='run_multiqc',
      dag=dag,
      queue='hpc_4G',
      trigger_rule='none_failed_or_skipped',
      python_callable=run_multiqc_for_cellranger,
      params={
        # Mapping of task id -> XCom key; for the Picard/samtools tasks the
        # value is the 'analysis_files_xcom_key' configured on that task.
        'list_of_analysis_xcoms_and_tasks':{
          'run_cellranger':'cellranger_output',
          'run_picard_alignment_summary':'picard_alignment_summary',
          'run_picard_qual_summary':'picard_qual_summary',
          'run_picard_rna_summary':'picard_rna_summary',
          'run_picard_gc_summary':'picard_gc_summary',
          'run_picard_base_dist_summary':'picard_base_summary',
          'run_samtools_stats':'samtools_stats',
          'run_samtools_idxstats':'samtools_idxstats'},
        'analysis_description_xcom_pull_task':'fetch_analysis_info',
        'analysis_description_xcom_key':'analysis_description',
        'use_ephemeral_space':True,
        # XCom key names under which the report outputs are exchanged;
        # load_multiqc_html reads 'multiqc_html' from this task.
        'multiqc_html_file_xcom_key':'multiqc_html',
        'multiqc_data_file_xcom_key':'multiqc_data',
        # Fixed typo: was 'picad'. The tool is spelled 'picard' everywhere
        # else in this DAG, and a misspelled name cannot match the tool.
        'tool_order_list':['picard','samtools']})
  ## PIPELINE
  # Every Picard summary task and the last samtools task feed the report.
  run_picard_alignment_summary >> run_multiqc
  run_picard_qual_summary >> run_multiqc
  run_picard_rna_summary >> run_multiqc
  run_picard_gc_summary >> run_multiqc
  run_picard_base_dist_summary >> run_multiqc
  run_samtools_idxstats >> run_multiqc
  ## TASK
  # Calls load_analysis_files_func for the HTML file that run_multiqc
  # publishes under the 'multiqc_html' XCom key; scheduled on 'hpc_4G'.
  load_multiqc_html = PythonOperator(
      task_id='load_multiqc_html',
      python_callable=load_analysis_files_func,
      queue='hpc_4G',
      dag=dag,
      params={
          # Task/key pair naming where the collection name comes from.
          'collection_name_task': 'load_cellranger_result_to_db',
          'collection_name_key': 'sample_igf_id',
          # Task/key pair naming where the file path comes from.
          'file_name_task': 'run_multiqc',
          'file_name_key': 'multiqc_html',
          'analysis_name': 'multiqc',
          'collection_type': 'MULTIQC_HTML',
          'collection_table': 'sample',
          # The upload tasks downstream read this key from this task.
          'output_files_key': 'output_db_files',
      },
  )
  ## PIPELINE
  run_multiqc >> load_multiqc_html
  ## TASK
  # Calls ftp_files_upload_for_analysis for the files that load_multiqc_html
  # publishes under the 'output_db_files' XCom key; scheduled on 'hpc_4G'.
  upload_multiqc_to_ftp = PythonOperator(
      task_id='upload_multiqc_to_ftp',
      python_callable=ftp_files_upload_for_analysis,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_task': 'load_multiqc_html',
          'xcom_pull_files_key': 'output_db_files',
          # Task/key pair naming where the collection name comes from.
          'collection_name_task': 'load_cellranger_result_to_db',
          'collection_name_key': 'sample_igf_id',
          'collection_type': 'FTP_MULTIQC_HTML',
          'collection_table': 'sample',
          'collect_remote_file': True,
      },
  )
  ## PIPELINE
  load_multiqc_html >> upload_multiqc_to_ftp
  ## TASK
  # Calls upload_analysis_file_to_box for the files that load_multiqc_html
  # publishes under the 'output_db_files' XCom key; scheduled on 'hpc_4G'.
  upload_multiqc_to_box = PythonOperator(
      task_id='upload_multiqc_to_box',
      python_callable=upload_analysis_file_to_box,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_task': 'load_multiqc_html',
          'xcom_pull_files_key': 'output_db_files',
          'analysis_tag': 'multiqc_report',
      },
  )
  ## PIPELINE
  load_multiqc_html >> upload_multiqc_to_box
  ## TASK
  def _update_analysis_and_status_noop(**context):
      """Do nothing; placeholder callable for ``update_analysis_and_status``.

      ``PythonOperator`` rejects a non-callable ``python_callable`` when the
      operator is constructed, so passing ``None`` prevents the whole DAG
      file from being imported. This stand-in keeps the DAG loadable.

      :param context: Any keyword arguments supplied by the operator; ignored.
      :returns: None
      """
      # TODO: replace with the real analysis/status update function.
      return None


  # Final join task on the 'hpc_4G' queue. The 'none_failed_or_skipped'
  # trigger rule is used instead of the default so that skipped upstream
  # branches do not block it (see the Airflow trigger-rule documentation).
  update_analysis_and_status = \
    PythonOperator(
      task_id='update_analysis_and_status',
      dag=dag,
      queue='hpc_4G',
      # NOTE(review): this was python_callable=None, which raises at DAG
      # parse time. The no-op placeholder performs no status update.
      python_callable=_update_analysis_and_status_noop,
      trigger_rule='none_failed_or_skipped')
  ## PIPELINE
  # All terminal upload tasks converge on the final task.
  upload_multiqc_to_ftp >> update_analysis_and_status
  upload_scanpy_report_for_sc_5p_to_ftp >> update_analysis_and_status
  upload_scanpy_report_for_sc_5p_to_box >> update_analysis_and_status
  upload_cellbrowser_for_sc_5p_to_ftp >> update_analysis_and_status
  upload_scirpy_report_for_vdj_to_ftp >> update_analysis_and_status
  upload_scirpy_report_for_vdj_to_box >> update_analysis_and_status
  upload_scirpy_report_for_vdj_b_to_ftp >> update_analysis_and_status
  upload_scirpy_report_for_vdj_b_to_box >> update_analysis_and_status
  upload_scirpy_report_for_vdj_t_to_ftp >> update_analysis_and_status
  upload_scirpy_report_for_vdj_t_to_box >> update_analysis_and_status
  upload_seurat_report_for_sc_5p_ftp >> update_analysis_and_status
  upload_seurat_report_for_sc_5p_to_box >> update_analysis_and_status
  upload_cellranger_results_to_irods >> update_analysis_and_status
  upload_cellranger_report_to_ftp >> update_analysis_and_status
  upload_cellranger_report_to_box >> update_analysis_and_status
  upload_cram_to_irods >> update_analysis_and_status