dag9_tenx_single_cell_immune_profiling.py 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809
  1. from datetime import timedelta
  2. import os,json,logging,subprocess
  3. from airflow.models import DAG,Variable
  4. from airflow.utils.dates import days_ago
  5. from airflow.operators.python_operator import PythonOperator
  6. from airflow.operators.python_operator import BranchPythonOperator
  7. from airflow.operators.dummy_operator import DummyOperator
  8. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import fetch_analysis_info_and_branch_func
  9. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import configure_cellranger_run_func
  10. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_sc_read_trimmming_func
  11. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_cellranger_tool
  12. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import decide_analysis_branch_func
  13. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_cellranger_result_to_db_func
  14. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import ftp_files_upload_for_analysis
  15. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import irods_files_upload_for_analysis
  16. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_scanpy_for_sc_5p_func
  17. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_singlecell_notebook_wrapper_func
  18. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_analysis_files_func
  19. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import task_branch_function
  20. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import upload_analysis_file_to_box
  21. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import convert_bam_to_cram_func
  22. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_picard_for_cellranger
  23. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_samtools_for_cellranger
  24. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_multiqc_for_cellranger
  25. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import index_and_copy_bam_for_parallel_analysis
  ## ARGS
# Operator-level defaults that Airflow applies to every task of this DAG.
# Only operator arguments belong here: 'max_active_runs' and 'catchup' are
# DAG constructor arguments and are ignored when placed in default_args, so
# they are passed to DAG() below instead.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(2),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 4,
    'retry_delay': timedelta(minutes=5),
    'provide_context': True,
}

# Names of the per-feature-type branches, read from an Airflow Variable that
# is stored as JSON. One branch (plus its trimming tasks) is built per entry.
FEATURE_TYPE_LIST = \
    Variable.get(
        'tenx_single_cell_immune_profiling_feature_types',
        deserialize_json=True)

## DAG
# Manually triggered DAG (no schedule). The run limit and catchup flag are
# set here so that they actually take effect.
dag = \
    DAG(
        dag_id='dag9_tenx_single_cell_immune_profiling',
        schedule_interval=None,
        max_active_runs=10,
        catchup=False,
        tags=['hpc', 'analysis', 'tenx', 'sc'],
        default_args=default_args,
        orientation='LR')
  49. with dag:
  ## TASK
  # First task of the DAG. It is a branch operator, so the value returned by
  # the callable selects which directly-downstream task(s) run. The params
  # carry the id of the fallback task and the xcom keys used for the analysis
  # description and analysis info.
  fetch_analysis_info_and_branch = BranchPythonOperator(
    task_id='fetch_analysis_info',
    python_callable=fetch_analysis_info_and_branch_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'no_analysis_task': 'no_analysis',
      'analysis_description_xcom_key': 'analysis_description',
      'analysis_info_xcom_key': 'analysis_info',
    },
  )
  ## TASK
  # Join point after the per-feature-type trimming branches. The
  # 'none_failed_or_skipped' trigger rule lets it run although some upstream
  # branches were skipped. It reads the xcom entries of 'fetch_analysis_info'
  # and is given 'cellranger_library_csv' as the key for the library csv.
  configure_cellranger_run = PythonOperator(
    task_id='configure_cellranger_run',
    python_callable=configure_cellranger_run_func,
    trigger_rule='none_failed_or_skipped',
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task_id': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'analysis_info_xcom_key': 'analysis_info',
      'library_csv_xcom_key': 'cellranger_library_csv',
    },
  )
  # Build one sub-graph per configured feature type:
  #   fetch_analysis_info -> <feature branch> -> run_trim_<feature>_0..9
  #     -> collect_trimmed_files_<feature> -> configure_cellranger_run
  for analysis_name in FEATURE_TYPE_LIST:
    ## TASK
    # Branch task named after the feature type; 'task_prefix' is the common
    # prefix of the trimming task ids created below.
    task_branch = BranchPythonOperator(
      task_id=analysis_name,
      python_callable=task_branch_function,
      queue='hpc_4G',
      dag=dag,
      params={
        'xcom_pull_task_id': 'fetch_analysis_info',
        'analysis_info_xcom_key': 'analysis_info',
        'analysis_name': analysis_name,
        'task_prefix': 'run_trim',
      },
    )
    # A fixed pool of ten trimming tasks is declared per feature type.
    run_trim_list = []
    for run_id in range(10):
      ## TASK
      t = PythonOperator(
        task_id='run_trim_{0}_{1}'.format(analysis_name, run_id),
        python_callable=run_sc_read_trimmming_func,
        queue='hpc_4G',
        dag=dag,
        params={
          'xcom_pull_task_id': 'fetch_analysis_info',
          'analysis_info_xcom_key': 'analysis_info',
          'analysis_description_xcom_key': 'analysis_description',
          'analysis_name': analysis_name,
          'run_id': run_id,
          'r1_length': 0,
          'r2_length': 0,
          'fastq_input_dir_tag': 'fastq_dir',
          'fastq_output_dir_tag': 'output_path',
        },
      )
      run_trim_list.append(t)
    ## TASK
    # No-op join for the trimming tasks of this feature type; the trigger
    # rule tolerates skipped trimming tasks.
    collect_trimmed_files = DummyOperator(
      task_id='collect_trimmed_files_{0}'.format(analysis_name),
      trigger_rule='none_failed_or_skipped',
      dag=dag,
    )
    ## PIPELINE
    fetch_analysis_info_and_branch >> task_branch >> run_trim_list
    run_trim_list >> collect_trimmed_files >> configure_cellranger_run
  ## TASK
  # No-op task with id 'no_analysis', the id handed to 'fetch_analysis_info'
  # as its 'no_analysis_task' param.
  no_analysis = DummyOperator(task_id='no_analysis', dag=dag)
  ## PIPELINE
  fetch_analysis_info_and_branch.set_downstream(no_analysis)
  ## TASK
  # Cellranger run on the large queue. The '--localcores 16' and
  # '--localmem 64' options mirror the queue name (64G, 16 threads). The
  # task is given the xcom keys for the analysis description (task
  # 'fetch_analysis_info') and the library csv (task
  # 'configure_cellranger_run'), plus 'cellranger_output' as its own key.
  run_cellranger = PythonOperator(
    task_id='run_cellranger',
    python_callable=run_cellranger_tool,
    queue='hpc_64G16t24hr',
    dag=dag,
    params={
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'library_csv_xcom_key': 'cellranger_library_csv',
      'library_csv_xcom_pull_task': 'configure_cellranger_run',
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_options': ['--localcores 16', '--localmem 64'],
    },
  )
  ## PIPELINE
  configure_cellranger_run.set_downstream(run_cellranger)
  ## TASK
  # Branch task that runs after cellranger. The params map logical analysis
  # names to the task ids the callable may return.
  # NOTE(review): 'run_picard_alignment_summary' and 'convert_bam_to_cram'
  # are not directly downstream of this task in this file (the BAM branch
  # hangs off 'copy_bam_for_parallel_runs'), and the cram task id is
  # 'convert_cellranger_bam_to_cram'. Confirm what
  # decide_analysis_branch_func actually returns for the BAM branch.
  decide_analysis_branch = BranchPythonOperator(
    task_id='decide_analysis_branch',
    python_callable=decide_analysis_branch_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'load_cellranger_result_to_db_task': 'load_cellranger_result_to_db',
      'run_scanpy_for_sc_5p_task': 'run_scanpy_for_sc_5p',
      'run_scirpy_for_vdj_task': 'run_scirpy_for_vdj',
      'run_scirpy_for_vdj_b_task': 'run_scirpy_for_vdj_b',
      'run_scirpy_vdj_t_task': 'run_scirpy_for_vdj_t',
      'run_seurat_for_sc_5p_task': 'run_seurat_for_sc_5p',
      'run_picard_alignment_summary_task': 'run_picard_alignment_summary',
      'convert_bam_to_cram_task': 'convert_bam_to_cram',
      'library_csv_xcom_key': 'cellranger_library_csv',
      'library_csv_xcom_pull_task': 'configure_cellranger_run',
    },
  )
  ## PIPELINE
  run_cellranger.set_downstream(decide_analysis_branch)
  ## TASK
  # Register the cellranger output in the database. This task is the source
  # of the 'sample_igf_id', 'loaded_output_files' and 'html_report_file'
  # xcom keys that the upload tasks below (and the report loaders of the
  # other branches) refer to.
  load_cellranger_result_to_db = \
    PythonOperator(
      task_id='load_cellranger_result_to_db',
      dag=dag,
      queue='hpc_4G',
      python_callable=load_cellranger_result_to_db_func,
      params={'analysis_description_xcom_pull_task':'fetch_analysis_info',
              'analysis_description_xcom_key':'analysis_description',
              'cellranger_xcom_key':'cellranger_output',
              'cellranger_xcom_pull_task':'run_cellranger',
              'collection_type':'CELLRANGER_MULTI',
              'collection_table':'sample',
              'xcom_collection_name_key':'sample_igf_id',
              'genome_column':'genome_build',
              'analysis_name':'cellranger_multi',
              'output_xcom_key':'loaded_output_files',
              'html_xcom_key':'html_report_file',
              'html_report_file_name':'web_summary.html'})
  # Publish the cellranger html report via FTP.
  upload_cellranger_report_to_ftp = \
    PythonOperator(
      task_id='upload_cellranger_report_to_ftp',
      dag=dag,
      queue='hpc_4G',
      python_callable=ftp_files_upload_for_analysis,
      params={'xcom_pull_task':'load_cellranger_result_to_db',
              'xcom_pull_files_key':'html_report_file',
              'collection_name_task':'load_cellranger_result_to_db',
              'collection_name_key':'sample_igf_id',
              'collection_type':'FTP_CELLRANGER_MULTI',
              'collection_table':'sample',
              'collect_remote_file':True})
  # Copy the cellranger html report to Box.
  # python_callable was None, which PythonOperator rejects at construction
  # time and therefore breaks the import of the whole DAG. It now uses the
  # same Box upload callable as the other *_to_box tasks of this file.
  # TODO confirm: the 'analysis_tag' value follows the naming used by the
  # sibling box tasks ('<analysis>_single_sample_report').
  upload_cellranger_report_to_box = \
    PythonOperator(
      task_id='upload_cellranger_report_to_box',
      dag=dag,
      queue='hpc_4G',
      python_callable=upload_analysis_file_to_box,
      params={'xcom_pull_task':'load_cellranger_result_to_db',
              'xcom_pull_files_key':'html_report_file',
              'analysis_tag':'cellranger_multi_single_sample_report'})
  # Archive the loaded cellranger output files in iRODS.
  upload_cellranger_results_to_irods = \
    PythonOperator(
      task_id='upload_cellranger_results_to_irods',
      dag=dag,
      queue='hpc_4G',
      python_callable=irods_files_upload_for_analysis,
      params={'xcom_pull_task':'load_cellranger_result_to_db',
              'xcom_pull_files_key':'loaded_output_files',
              'collection_name_key':'sample_igf_id',
              'analysis_name':'cellranger_multi'})
  ## PIPELINE
  decide_analysis_branch >> load_cellranger_result_to_db
  load_cellranger_result_to_db >> upload_cellranger_report_to_ftp
  load_cellranger_result_to_db >> upload_cellranger_report_to_box
  load_cellranger_result_to_db >> upload_cellranger_results_to_irods
  ## TASK
  # Scanpy branch: run the notebook wrapper on the cellranger output, load
  # the resulting notebook into the database, then publish the report to FTP
  # and Box. The cellbrowser dirs go to FTP straight from the notebook task.
  run_scanpy_for_sc_5p = PythonOperator(
    task_id='run_scanpy_for_sc_5p',
    python_callable=run_singlecell_notebook_wrapper_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'count_dir': 'count',
      'analysis_name': 'scanpy',
      'output_notebook_key': 'scanpy_notebook',
      'output_cellbrowser_key': 'cellbrowser_dirs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    },
  )
  # Reads the notebook from xcom key 'scanpy_notebook' of the task above.
  load_scanpy_report_for_sc_5p_to_db = PythonOperator(
    task_id='load_scanpy_report_for_sc_5p_to_db',
    python_callable=load_analysis_files_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scanpy_for_sc_5p',
      'file_name_key': 'scanpy_notebook',
      'analysis_name': 'scanpy_5p',
      'collection_type': 'SCANPY_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    },
  )
  upload_scanpy_report_for_sc_5p_to_ftp = PythonOperator(
    task_id='upload_scanpy_report_for_sc_5p_to_ftp',
    python_callable=ftp_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_scanpy_report_for_sc_5p_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCANPY_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    },
  )
  upload_scanpy_report_for_sc_5p_to_box = PythonOperator(
    task_id='upload_scanpy_report_for_sc_5p_to_box',
    python_callable=upload_analysis_file_to_box,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_scanpy_report_for_sc_5p_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scanpy_single_sample_report',
    },
  )
  # Reads key 'cellbrowser_dirs' directly from the notebook task.
  upload_cellbrowser_for_sc_5p_to_ftp = PythonOperator(
    task_id='upload_cellbrowser_for_sc_5p_to_ftp',
    python_callable=ftp_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'run_scanpy_for_sc_5p',
      'xcom_pull_files_key': 'cellbrowser_dirs',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_CELLBROWSER',
      'collection_table': 'sample',
      'collect_remote_file': True,
    },
  )
  ## PIPELINE
  decide_analysis_branch >> run_scanpy_for_sc_5p
  run_scanpy_for_sc_5p >> [
    load_scanpy_report_for_sc_5p_to_db,
    upload_cellbrowser_for_sc_5p_to_ftp,
  ]
  load_scanpy_report_for_sc_5p_to_db >> [
    upload_scanpy_report_for_sc_5p_to_ftp,
    upload_scanpy_report_for_sc_5p_to_box,
  ]
  ## TASK
  # Scirpy branch for the 'vdj' output dir: run the notebook wrapper, load
  # the notebook into the database, then publish the report to FTP and Box.
  run_scirpy_for_vdj = \
    PythonOperator(
      task_id='run_scirpy_for_vdj',
      dag=dag,
      queue='hpc_4G',
      python_callable=run_singlecell_notebook_wrapper_func,
      params={'cellranger_xcom_key':'cellranger_output',
              'cellranger_xcom_pull_task':'run_cellranger',
              'scanpy_timeout':1200,
              'allow_errors':False,
              'kernel_name':'python3',
              'analysis_name':'scirpy',
              'vdj_dir':'vdj',
              'count_dir':'count',
              'output_notebook_key':'scirpy_notebook',
              'analysis_description_xcom_pull_task':'fetch_analysis_info',
              'analysis_description_xcom_key':'analysis_description'})
  load_scirpy_report_for_vdj_to_db = \
    PythonOperator(
      task_id='load_scirpy_report_for_vdj_to_db',
      dag=dag,
      queue='hpc_4G',
      python_callable=load_analysis_files_func,
      params={'collection_name_task':'load_cellranger_result_to_db',
              'collection_name_key':'sample_igf_id',
              'file_name_task':'run_scirpy_for_vdj',
              'file_name_key':'scirpy_notebook',
              'analysis_name':'scirpy_vdj',
              'collection_type':'SCIRPY_VDJ_HTML',
              'collection_table':'sample',
              'output_files_key':'output_db_files'})
  # The two upload tasks must pull from the loader defined just above. The
  # previous value 'load_scanpy_report_for_vdj_to_db' is not the id of any
  # task in this DAG, so the pull could not find the loaded files.
  upload_scirpy_report_for_vdj_to_ftp = \
    PythonOperator(
      task_id='upload_scirpy_report_for_vdj_to_ftp',
      dag=dag,
      queue='hpc_4G',
      python_callable=ftp_files_upload_for_analysis,
      params={'xcom_pull_task':'load_scirpy_report_for_vdj_to_db',
              'xcom_pull_files_key':'output_db_files',
              'collection_name_task':'load_cellranger_result_to_db',
              'collection_name_key':'sample_igf_id',
              'collection_type':'FTP_SCIRPY_VDJ_HTML',
              'collection_table':'sample',
              'collect_remote_file':True})
  upload_scirpy_report_for_vdj_to_box = \
    PythonOperator(
      task_id='upload_scirpy_report_for_vdj_to_box',
      dag=dag,
      queue='hpc_4G',
      python_callable=upload_analysis_file_to_box,
      params={'xcom_pull_task':'load_scirpy_report_for_vdj_to_db',
              'xcom_pull_files_key':'output_db_files',
              'analysis_tag':'scirpy_vdj_single_sample_report'})
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj
  run_scirpy_for_vdj >> load_scirpy_report_for_vdj_to_db
  load_scirpy_report_for_vdj_to_db >> upload_scirpy_report_for_vdj_to_ftp
  load_scirpy_report_for_vdj_to_db >> upload_scirpy_report_for_vdj_to_box
  ## TASK
  # Scirpy branch for the 'vdj_b' output dir: run the notebook wrapper, load
  # the notebook into the database, then publish the report to FTP and Box.
  run_scirpy_for_vdj_b = \
    PythonOperator(
      task_id='run_scirpy_for_vdj_b',
      dag=dag,
      queue='hpc_4G',
      python_callable=run_singlecell_notebook_wrapper_func,
      params={'cellranger_xcom_key':'cellranger_output',
              'cellranger_xcom_pull_task':'run_cellranger',
              'scanpy_timeout':1200,
              'allow_errors':False,
              'kernel_name':'python3',
              'analysis_name':'scirpy',
              'vdj_dir':'vdj_b',
              'count_dir':'count',
              'output_notebook_key':'scirpy_notebook',
              'analysis_description_xcom_pull_task':'fetch_analysis_info',
              'analysis_description_xcom_key':'analysis_description'})
  load_scirpy_report_for_vdj_b_to_db = \
    PythonOperator(
      task_id='load_scirpy_report_for_vdj_b_to_db',
      dag=dag,
      queue='hpc_4G',
      python_callable=load_analysis_files_func,
      params={'collection_name_task':'load_cellranger_result_to_db',
              'collection_name_key':'sample_igf_id',
              'file_name_task':'run_scirpy_for_vdj_b',
              'file_name_key':'scirpy_notebook',
              'analysis_name':'scirpy_vdj_b',
              'collection_type':'SCIRPY_VDJ_B_HTML',
              'collection_table':'sample',
              'output_files_key':'output_db_files'})
  # The two upload tasks must pull from the loader defined just above. The
  # previous value 'load_scanpy_report_for_vdj_b_to_db' is not the id of any
  # task in this DAG, so the pull could not find the loaded files.
  upload_scirpy_report_for_vdj_b_to_ftp = \
    PythonOperator(
      task_id='upload_scirpy_report_for_vdj_b_to_ftp',
      dag=dag,
      queue='hpc_4G',
      python_callable=ftp_files_upload_for_analysis,
      params={'xcom_pull_task':'load_scirpy_report_for_vdj_b_to_db',
              'xcom_pull_files_key':'output_db_files',
              'collection_name_task':'load_cellranger_result_to_db',
              'collection_name_key':'sample_igf_id',
              'collection_type':'FTP_SCIRPY_VDJ_B_HTML',
              'collection_table':'sample',
              'collect_remote_file':True})
  # task_id renamed from 'upload_scanpy_report_for_vdj_b_to_box' so that it
  # matches the variable name and the vdj / vdj_t sibling tasks. No other
  # task in this file refers to the old id.
  upload_scirpy_report_for_vdj_b_to_box = \
    PythonOperator(
      task_id='upload_scirpy_report_for_vdj_b_to_box',
      dag=dag,
      queue='hpc_4G',
      python_callable=upload_analysis_file_to_box,
      params={'xcom_pull_task':'load_scirpy_report_for_vdj_b_to_db',
              'xcom_pull_files_key':'output_db_files',
              'analysis_tag':'scirpy_vdj_b_single_sample_report'})
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj_b
  run_scirpy_for_vdj_b >> load_scirpy_report_for_vdj_b_to_db
  load_scirpy_report_for_vdj_b_to_db >> upload_scirpy_report_for_vdj_b_to_ftp
  load_scirpy_report_for_vdj_b_to_db >> upload_scirpy_report_for_vdj_b_to_box
  ## TASK
  # Scirpy branch for the 'vdj_t' output dir: run the notebook wrapper, load
  # the notebook into the database, then publish the report to FTP and Box.
  run_scirpy_for_vdj_t = \
    PythonOperator(
      task_id='run_scirpy_for_vdj_t',
      dag=dag,
      queue='hpc_4G',
      python_callable=run_singlecell_notebook_wrapper_func,
      params={'cellranger_xcom_key':'cellranger_output',
              'cellranger_xcom_pull_task':'run_cellranger',
              'scanpy_timeout':1200,
              'allow_errors':False,
              'kernel_name':'python3',
              'analysis_name':'scirpy',
              'vdj_dir':'vdj_t',
              'count_dir':'count',
              'output_notebook_key':'scirpy_notebook',
              'analysis_description_xcom_pull_task':'fetch_analysis_info',
              'analysis_description_xcom_key':'analysis_description'})
  load_scirpy_report_for_vdj_t_to_db = \
    PythonOperator(
      task_id='load_scirpy_report_for_vdj_t_to_db',
      dag=dag,
      queue='hpc_4G',
      python_callable=load_analysis_files_func,
      params={'collection_name_task':'load_cellranger_result_to_db',
              'collection_name_key':'sample_igf_id',
              'file_name_task':'run_scirpy_for_vdj_t',
              'file_name_key':'scirpy_notebook',
              'analysis_name':'scirpy_vdj_t',
              'collection_type':'SCIRPY_VDJ_T_HTML',
              'collection_table':'sample',
              'output_files_key':'output_db_files'})
  # The two upload tasks must pull from the loader defined just above. The
  # previous value 'load_scanpy_report_for_vdj_t_to_db' is not the id of any
  # task in this DAG, so the pull could not find the loaded files.
  upload_scirpy_report_for_vdj_t_to_ftp = \
    PythonOperator(
      task_id='upload_scirpy_report_for_vdj_t_to_ftp',
      dag=dag,
      queue='hpc_4G',
      python_callable=ftp_files_upload_for_analysis,
      params={'xcom_pull_task':'load_scirpy_report_for_vdj_t_to_db',
              'xcom_pull_files_key':'output_db_files',
              'collection_name_task':'load_cellranger_result_to_db',
              'collection_name_key':'sample_igf_id',
              'collection_type':'FTP_SCIRPY_VDJ_T_HTML',
              'collection_table':'sample',
              'collect_remote_file':True})
  upload_scirpy_report_for_vdj_t_to_box = \
    PythonOperator(
      task_id='upload_scirpy_report_for_vdj_t_to_box',
      dag=dag,
      queue='hpc_4G',
      python_callable=upload_analysis_file_to_box,
      params={'xcom_pull_task':'load_scirpy_report_for_vdj_t_to_db',
              'xcom_pull_files_key':'output_db_files',
              'analysis_tag':'scirpy_vdj_t_single_sample_report'})
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj_t
  run_scirpy_for_vdj_t >> load_scirpy_report_for_vdj_t_to_db
  load_scirpy_report_for_vdj_t_to_db >> upload_scirpy_report_for_vdj_t_to_ftp
  load_scirpy_report_for_vdj_t_to_db >> upload_scirpy_report_for_vdj_t_to_box
  ## TASK
  # Seurat branch: same notebook wrapper as the other analyses but with the
  # 'R' kernel. The notebook is loaded into the database and the report is
  # then published to FTP and Box.
  run_seurat_for_sc_5p = PythonOperator(
    task_id='run_seurat_for_sc_5p',
    python_callable=run_singlecell_notebook_wrapper_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'R',
      'analysis_name': 'seurat',
      'vdj_dir': 'vdj',
      'count_dir': 'count',
      'output_notebook_key': 'seurat_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    },
  )
  # Reads the notebook from xcom key 'seurat_notebook' of the task above.
  load_seurat_report_for_sc_5p_db = PythonOperator(
    task_id='load_seurat_report_for_sc_5p_db',
    python_callable=load_analysis_files_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_seurat_for_sc_5p',
      'file_name_key': 'seurat_notebook',
      'analysis_name': 'seurat_5p',
      'collection_type': 'SEURAT_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    },
  )
  upload_seurat_report_for_sc_5p_ftp = PythonOperator(
    task_id='upload_seurat_report_for_sc_5p_ftp',
    python_callable=ftp_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_seurat_report_for_sc_5p_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SEURAT_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    },
  )
  upload_seurat_report_for_sc_5p_to_box = PythonOperator(
    task_id='upload_seurat_report_for_sc_5p_to_box',
    python_callable=upload_analysis_file_to_box,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_seurat_report_for_sc_5p_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'seurat_single_sample_report',
    },
  )
  ## PIPELINE
  decide_analysis_branch >> run_seurat_for_sc_5p
  run_seurat_for_sc_5p >> load_seurat_report_for_sc_5p_db
  load_seurat_report_for_sc_5p_db >> [
    upload_seurat_report_for_sc_5p_ftp,
    upload_seurat_report_for_sc_5p_to_box,
  ]
  ## TASK
  # Branch task at the head of the BAM-based QC tasks. 'list_of_tasks' names
  # the downstream consumers.
  # NOTE(review): presumably one BAM copy is staged per listed task and
  # pushed to xcom under that task's id — confirm in
  # index_and_copy_bam_for_parallel_analysis.
  copy_bam_for_parallel_runs = BranchPythonOperator(
    task_id='copy_bam_for_parallel_runs',
    python_callable=index_and_copy_bam_for_parallel_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_files_key': 'cellranger_output',
      'xcom_pull_task': 'run_cellranger',
      'list_of_tasks': [
        'convert_cellranger_bam_to_cram',
        'run_picard_alignment_summary',
        'run_picard_qual_summary',
        'run_picard_rna_summary',
        'run_picard_gc_summary',
        'run_picard_base_dist_summary',
        'run_samtools_stats',
      ],
    },
  )
  ## TASK
  # BAM to CRAM conversion on the 4-thread queue. Its input is pulled from
  # the copy task under the key equal to this task's own id.
  convert_bam_to_cram = PythonOperator(
    task_id='convert_cellranger_bam_to_cram',
    python_callable=convert_bam_to_cram_func,
    queue='hpc_4G4t',
    dag=dag,
    params={
      'xcom_pull_files_key': 'convert_cellranger_bam_to_cram',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'threads': 4,
      'analysis_name': 'cellranger',
      'collection_type': 'ALIGNED_CRAM',
      'collection_table': 'sample',
      'cram_files_xcom_key': 'cram_files',
    },
  )
  # Archive the cram files (xcom key 'cram_files') in iRODS.
  upload_cram_to_irods = PythonOperator(
    task_id='upload_cram_to_irods',
    python_callable=irods_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'convert_cellranger_bam_to_cram',
      'xcom_pull_files_key': 'cram_files',
      'collection_name_key': 'sample_igf_id',
      'analysis_name': 'cellranger_multi',
    },
  )
  ## PIPELINE
  decide_analysis_branch >> copy_bam_for_parallel_runs
  copy_bam_for_parallel_runs >> convert_bam_to_cram >> upload_cram_to_irods
  ## TASK
  # Picard CollectAlignmentSummaryMetrics on the BAM staged for this task.
  # The pull key now equals this task's id, as for the other consumers of
  # 'copy_bam_for_parallel_runs' (cram conversion, base distribution,
  # samtools stats). The previous key 'cellranger_output' belongs to
  # 'run_cellranger', not to the copy task named in 'xcom_pull_task'.
  # NOTE(review): assumes the copy task pushes one xcom entry per id in its
  # 'list_of_tasks' — confirm in index_and_copy_bam_for_parallel_analysis.
  run_picard_alignment_summary = \
    PythonOperator(
      task_id='run_picard_alignment_summary',
      dag=dag,
      queue='hpc_4G',
      python_callable=run_picard_for_cellranger,
      params={'xcom_pull_files_key':'run_picard_alignment_summary',
              'xcom_pull_task':'copy_bam_for_parallel_runs',
              'analysis_description_xcom_pull_task':'fetch_analysis_info',
              'analysis_description_xcom_key':'analysis_description',
              'use_ephemeral_space':True,
              'load_metrics_to_cram':True,
              'java_param':'-Xmx4g',
              'picard_command':'CollectAlignmentSummaryMetrics',
              'picard_option':{},
              'analysis_files_xcom_key':'picard_alignment_summary',
              'bam_files_xcom_key':None})
  ## PIPELINE
  copy_bam_for_parallel_runs >> run_picard_alignment_summary
  ## TASK
  # Picard QualityScoreDistribution on the BAM staged for this task.
  # The pull key now equals this task's id, as for the other consumers of
  # 'copy_bam_for_parallel_runs' (cram conversion, base distribution,
  # samtools stats). The previous key 'cellranger_output' belongs to
  # 'run_cellranger', not to the copy task named in 'xcom_pull_task'.
  # NOTE(review): assumes the copy task pushes one xcom entry per id in its
  # 'list_of_tasks' — confirm in index_and_copy_bam_for_parallel_analysis.
  run_picard_qual_summary = \
    PythonOperator(
      task_id='run_picard_qual_summary',
      dag=dag,
      queue='hpc_4G',
      python_callable=run_picard_for_cellranger,
      params={'xcom_pull_files_key':'run_picard_qual_summary',
              'xcom_pull_task':'copy_bam_for_parallel_runs',
              'analysis_description_xcom_pull_task':'fetch_analysis_info',
              'analysis_description_xcom_key':'analysis_description',
              'use_ephemeral_space':True,
              'load_metrics_to_cram':True,
              'java_param':'-Xmx4g',
              'picard_command':'QualityScoreDistribution',
              'picard_option':{},
              'analysis_files_xcom_key':'picard_qual_summary',
              'bam_files_xcom_key':None})
  ## PIPELINE
  copy_bam_for_parallel_runs >> run_picard_qual_summary
  ## TASK
  # Picard CollectRnaSeqMetrics on the BAM staged for this task.
  # The pull key now equals this task's id, as for the other consumers of
  # 'copy_bam_for_parallel_runs' (cram conversion, base distribution,
  # samtools stats). The previous key 'cellranger_output' belongs to
  # 'run_cellranger', not to the copy task named in 'xcom_pull_task'.
  # NOTE(review): assumes the copy task pushes one xcom entry per id in its
  # 'list_of_tasks' — confirm in index_and_copy_bam_for_parallel_analysis.
  run_picard_rna_summary = \
    PythonOperator(
      task_id='run_picard_rna_summary',
      dag=dag,
      queue='hpc_4G',
      python_callable=run_picard_for_cellranger,
      params={'xcom_pull_files_key':'run_picard_rna_summary',
              'xcom_pull_task':'copy_bam_for_parallel_runs',
              'analysis_description_xcom_pull_task':'fetch_analysis_info',
              'analysis_description_xcom_key':'analysis_description',
              'use_ephemeral_space':True,
              'load_metrics_to_cram':True,
              'java_param':'-Xmx4g',
              'picard_command':'CollectRnaSeqMetrics',
              'picard_option':{},
              'analysis_files_xcom_key':'picard_rna_summary',
              'bam_files_xcom_key':None})
  ## PIPELINE
  copy_bam_for_parallel_runs >> run_picard_rna_summary
  ## TASK
  # Picard CollectGcBiasMetrics on the BAM staged for this task.
  # The pull key now equals this task's id, as for the other consumers of
  # 'copy_bam_for_parallel_runs' (cram conversion, base distribution,
  # samtools stats). The previous key 'cellranger_output' belongs to
  # 'run_cellranger', not to the copy task named in 'xcom_pull_task'.
  # NOTE(review): assumes the copy task pushes one xcom entry per id in its
  # 'list_of_tasks' — confirm in index_and_copy_bam_for_parallel_analysis.
  run_picard_gc_summary = \
    PythonOperator(
      task_id='run_picard_gc_summary',
      dag=dag,
      queue='hpc_4G',
      python_callable=run_picard_for_cellranger,
      params={'xcom_pull_files_key':'run_picard_gc_summary',
              'xcom_pull_task':'copy_bam_for_parallel_runs',
              'analysis_description_xcom_pull_task':'fetch_analysis_info',
              'analysis_description_xcom_key':'analysis_description',
              'use_ephemeral_space':True,
              'load_metrics_to_cram':True,
              'java_param':'-Xmx4g',
              'picard_command':'CollectGcBiasMetrics',
              'picard_option':{},
              'analysis_files_xcom_key':'picard_gc_summary',
              'bam_files_xcom_key':None})
  ## PIPELINE
  copy_bam_for_parallel_runs >> run_picard_gc_summary
  ## TASK
  # Picard CollectBaseDistributionByCycle. The input is pulled from the copy
  # task under the key equal to this task's id; the metrics are published
  # under 'picard_base_summary'.
  run_picard_base_dist_summary = PythonOperator(
    task_id='run_picard_base_dist_summary',
    python_callable=run_picard_for_cellranger,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_files_key': 'run_picard_base_dist_summary',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'java_param': '-Xmx4g',
      'picard_command': 'CollectBaseDistributionByCycle',
      'picard_option': {},
      'analysis_files_xcom_key': 'picard_base_summary',
      'bam_files_xcom_key': None,
    },
  )
  ## PIPELINE
  copy_bam_for_parallel_runs.set_downstream(run_picard_base_dist_summary)
  ## TASK
  # 'samtools stats' on the 4-thread queue. The input is pulled from the
  # copy task under the key equal to this task's id.
  run_samtools_stats = PythonOperator(
    task_id='run_samtools_stats',
    python_callable=run_samtools_for_cellranger,
    queue='hpc_4G4t',
    dag=dag,
    params={
      'xcom_pull_files_key': 'run_samtools_stats',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'samtools_command': 'stats',
      'threads': 4,
      'analysis_files_xcom_key': 'samtools_stats',
    },
  )
  ## PIPELINE
  copy_bam_for_parallel_runs.set_downstream(run_samtools_stats)
  ## TASK
  # 'samtools idxstats', chained after 'run_samtools_stats'. It pulls the
  # same xcom key as that task ('run_samtools_stats'); this task is not in
  # the copy task's 'list_of_tasks'.
  run_samtools_idxstats = PythonOperator(
    task_id='run_samtools_idxstats',
    python_callable=run_samtools_for_cellranger,
    queue='hpc_4G4t',
    dag=dag,
    params={
      'xcom_pull_files_key': 'run_samtools_stats',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'samtools_command': 'idxstats',
      'threads': 4,
      'analysis_files_xcom_key': 'samtools_idxstats',
    },
  )
  ## PIPELINE
  run_samtools_stats.set_downstream(run_samtools_idxstats)
  ## TASK
  # Runs run_multiqc_for_cellranger once the QC tasks upstream have finished.
  # trigger_rule 'none_failed_or_skipped' is used instead of the default
  # rule for this fan-in task.
  run_multiqc = PythonOperator(
      task_id='run_multiqc',
      dag=dag,
      queue='hpc_4G',
      trigger_rule='none_failed_or_skipped',
      python_callable=run_multiqc_for_cellranger,
      params={
          # Maps each producing task id to the XCom key under which that
          # task declares its analysis files ('analysis_files_xcom_key' in
          # the task definitions for the picard/samtools steps).
          'list_of_analysis_xcoms_and_tasks': {
              'run_cellranger': 'cellranger_output',
              'run_picard_alignment_summary': 'picard_alignment_summary',
              'run_picard_qual_summary': 'picard_qual_summary',
              'run_picard_rna_summary': 'picard_rna_summary',
              'run_picard_gc_summary': 'picard_gc_summary',
              'run_picard_base_dist_summary': 'picard_base_summary',
              'run_samtools_stats': 'samtools_stats',
              'run_samtools_idxstats': 'samtools_idxstats',
          },
          # XCom key/task pair pointing at fetch_analysis_info
          'analysis_description_xcom_pull_task': 'fetch_analysis_info',
          'analysis_description_xcom_key': 'analysis_description',
          'use_ephemeral_space': True,
          # Keys under which the report files are published; load_multiqc_html
          # reads 'multiqc_html' from this task.
          'multiqc_html_file_xcom_key': 'multiqc_html',
          'multiqc_data_file_xcom_key': 'multiqc_data',
          # Fixed typo: was 'picad'. Every Picard task and key in this DAG
          # spells the tool 'picard'.
          # NOTE(review): presumably this list sets the tool/module order in
          # the MultiQC report - confirm against run_multiqc_for_cellranger.
          'tool_order_list': ['picard', 'samtools'],
      },
  )
  ## PIPELINE
  # Fan-in: every Picard summary plus the last samtools step feed MultiQC.
  run_picard_alignment_summary >> run_multiqc
  run_picard_qual_summary >> run_multiqc
  run_picard_rna_summary >> run_multiqc
  run_picard_gc_summary >> run_multiqc
  run_picard_base_dist_summary >> run_multiqc
  run_samtools_idxstats >> run_multiqc
  ## TASK
  # Runs load_analysis_files_func for the MultiQC HTML report. The params
  # name the tasks and XCom keys to read from; their exact handling is
  # defined by the callable, which is not visible in this part of the file.
  load_multiqc_html = PythonOperator(
      task_id='load_multiqc_html',
      python_callable=load_analysis_files_func,
      queue='hpc_4G',
      dag=dag,
      params={
          # Collection name comes from load_cellranger_result_to_db
          'collection_name_task': 'load_cellranger_result_to_db',
          'collection_name_key': 'sample_igf_id',
          # File name comes from run_multiqc's 'multiqc_html' key
          'file_name_task': 'run_multiqc',
          'file_name_key': 'multiqc_html',
          'analysis_name': 'multiqc',
          'collection_type': 'MULTIQC_HTML',
          'collection_table': 'sample',
          # Key referenced by the FTP and Box upload tasks downstream
          'output_files_key': 'output_db_files',
      },
  )
  ## PIPELINE
  # Loading happens only after the MultiQC report task.
  run_multiqc.set_downstream(load_multiqc_html)
  ## TASK
  # Runs ftp_files_upload_for_analysis on the files published by
  # load_multiqc_html under the 'output_db_files' key.
  upload_multiqc_to_ftp = PythonOperator(
      task_id='upload_multiqc_to_ftp',
      python_callable=ftp_files_upload_for_analysis,
      queue='hpc_4G',
      dag=dag,
      params={
          # Files to upload: output of load_multiqc_html
          'xcom_pull_task': 'load_multiqc_html',
          'xcom_pull_files_key': 'output_db_files',
          # Collection name comes from load_cellranger_result_to_db
          'collection_name_task': 'load_cellranger_result_to_db',
          'collection_name_key': 'sample_igf_id',
          'collection_type': 'FTP_MULTIQC_HTML',
          'collection_table': 'sample',
          'collect_remote_file': True,
      },
  )
  ## PIPELINE
  # Upload follows the load step.
  load_multiqc_html.set_downstream(upload_multiqc_to_ftp)
  ## TASK
  # Runs upload_analysis_file_to_box on the files published by
  # load_multiqc_html under the 'output_db_files' key, tagged
  # 'multiqc_report'.
  upload_multiqc_to_box = PythonOperator(
      task_id='upload_multiqc_to_box',
      python_callable=upload_analysis_file_to_box,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_task': 'load_multiqc_html',
          'xcom_pull_files_key': 'output_db_files',
          'analysis_tag': 'multiqc_report',
      },
  )
  ## PIPELINE
  # Upload follows the load step.
  load_multiqc_html.set_downstream(upload_multiqc_to_box)
  ## TASK
  # Final fan-in task: it waits on every upload task listed below and uses
  # trigger_rule 'none_failed_or_skipped' instead of the default rule.
  # NOTE(review): python_callable is None. PythonOperator expects a callable,
  # so this looks like an unfinished placeholder - confirm which function is
  # meant to update the analysis status and wire it in.
  update_analysis_and_status = PythonOperator(
      task_id='update_analysis_and_status',
      python_callable=None,
      trigger_rule='none_failed_or_skipped',
      queue='hpc_4G',
      dag=dag,
  )
  ## PIPELINE
  # Same upstream tasks, registered in the same order as the original
  # one-edge-per-line statements.
  update_analysis_and_status.set_upstream([
      upload_multiqc_to_ftp,
      upload_scanpy_report_for_sc_5p_to_ftp,
      upload_scanpy_report_for_sc_5p_to_box,
      upload_cellbrowser_for_sc_5p_to_ftp,
      upload_scirpy_report_for_vdj_to_ftp,
      upload_scirpy_report_for_vdj_to_box,
      upload_scirpy_report_for_vdj_b_to_ftp,
      upload_scirpy_report_for_vdj_b_to_box,
      upload_scirpy_report_for_vdj_t_to_ftp,
      upload_scirpy_report_for_vdj_t_to_box,
      upload_seurat_report_for_sc_5p_ftp,
      upload_seurat_report_for_sc_5p_to_box,
      upload_cellranger_results_to_irods,
      upload_cellranger_report_to_ftp,
      upload_cellranger_report_to_box,
      upload_cram_to_irods,
  ])