dag9_tenx_single_cell_immune_profiling.py 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811
  1. from datetime import timedelta
  2. import os,json,logging,subprocess
  3. from airflow.models import DAG,Variable
  4. from airflow.utils.dates import days_ago
  5. from airflow.operators.python_operator import PythonOperator
  6. from airflow.operators.python_operator import BranchPythonOperator
  7. from airflow.operators.dummy_operator import DummyOperator
  8. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import fetch_analysis_info_and_branch_func
  9. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import configure_cellranger_run_func
  10. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_sc_read_trimmming_func
  11. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_cellranger_tool
  12. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import decide_analysis_branch_func
  13. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_cellranger_result_to_db_func
  14. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import ftp_files_upload_for_analysis
  15. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import irods_files_upload_for_analysis
  16. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_scanpy_for_sc_5p_func
  17. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_singlecell_notebook_wrapper_func
  18. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_analysis_files_func
  19. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import task_branch_function
  20. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import upload_analysis_file_to_box
  21. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import convert_bam_to_cram_func
  22. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_picard_for_cellranger
  23. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_samtools_for_cellranger
  24. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_multiqc_for_cellranger
  25. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import index_and_copy_bam_for_parallel_analysis
  26. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import change_pipeline_status
  27. ## ARGS
  28. default_args = {
  29. 'owner': 'airflow',
  30. 'depends_on_past': False,
  31. 'start_date': days_ago(2),
  32. 'email_on_failure': False,
  33. 'email_on_retry': False,
  34. 'retries': 4,
  35. 'max_active_runs':10,
  36. 'catchup':False,
  37. 'retry_delay': timedelta(minutes=5),
  38. 'provide_context': True,
  39. }
  40. FEATURE_TYPE_LIST = \
  41. Variable.get('tenx_single_cell_immune_profiling_feature_types',deserialize_json=True)#.split(',')
  42. ## DAG
  43. dag = \
  44. DAG(
  45. dag_id='dag9_tenx_single_cell_immune_profiling',
  46. schedule_interval=None,
  47. tags=['hpc','analysis','tenx','sc'],
  48. default_args=default_args,
  49. orientation='LR')
  50. with dag:
  51. ## TASK
  52. fetch_analysis_info_and_branch = \
  53. BranchPythonOperator(
  54. task_id='fetch_analysis_info',
  55. dag=dag,
  56. queue='hpc_4G',
  57. params={'no_analysis_task':'no_analysis',
  58. 'analysis_description_xcom_key':'analysis_description',
  59. 'analysis_info_xcom_key':'analysis_info'},
  60. python_callable=fetch_analysis_info_and_branch_func)
  ## TASK
  # Join point after the trimming branches. The trigger rule lets it run when
  # some upstream branches were skipped, as long as none failed. Reads the
  # analysis XComs from 'fetch_analysis_info' and is told to use
  # 'cellranger_library_csv' as the library-csv XCom key.
  configure_cellranger_run = PythonOperator(
    task_id='configure_cellranger_run',
    python_callable=configure_cellranger_run_func,
    trigger_rule='none_failed_or_skipped',
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task_id': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'analysis_info_xcom_key': 'analysis_info',
      'library_csv_xcom_key': 'cellranger_library_csv',
    })
  # One branch per feature type: a branch operator named after the feature
  # type, a fixed pool of ten trimming tasks, and a collector that joins them.
  for analysis_name in FEATURE_TYPE_LIST:
    ## TASK
    # Branch operator for this feature type; 'task_prefix' matches the prefix
    # of the trimming task ids created below.
    task_branch = BranchPythonOperator(
      task_id=analysis_name,
      python_callable=task_branch_function,
      queue='hpc_4G',
      dag=dag,
      params={
        'xcom_pull_task_id': 'fetch_analysis_info',
        'analysis_info_xcom_key': 'analysis_info',
        'analysis_name': analysis_name,
        'task_prefix': 'run_trim',
      })
    ## TASK
    # Trimming tasks run_trim_<analysis_name>_0 .. run_trim_<analysis_name>_9.
    run_trim_list = [
      PythonOperator(
        task_id='run_trim_{0}_{1}'.format(analysis_name,run_id),
        python_callable=run_sc_read_trimmming_func,
        queue='hpc_4G',
        dag=dag,
        params={
          'xcom_pull_task_id': 'fetch_analysis_info',
          'analysis_info_xcom_key': 'analysis_info',
          'analysis_description_xcom_key': 'analysis_description',
          'analysis_name': analysis_name,
          'run_id': run_id,
          'r1_length': 0,
          'r2_length': 0,
          'fastq_input_dir_tag': 'fastq_dir',
          'fastq_output_dir_tag': 'output_path',
        })
      for run_id in range(0,10)
    ]
    ## TASK
    # Collector: tolerates skipped trimming tasks, blocks on failed ones.
    collect_trimmed_files = DummyOperator(
      task_id='collect_trimmed_files_{0}'.format(analysis_name),
      trigger_rule='none_failed_or_skipped',
      dag=dag)
    ## PIPELINE
    fetch_analysis_info_and_branch >> task_branch
    task_branch >> run_trim_list
    run_trim_list >> collect_trimmed_files
    collect_trimmed_files >> configure_cellranger_run
  ## TASK
  # Placeholder task; its id is the one handed to 'fetch_analysis_info' as
  # 'no_analysis_task'.
  no_analysis = DummyOperator(
    dag=dag,
    task_id='no_analysis')
  ## PIPELINE
  fetch_analysis_info_and_branch >> no_analysis
  ## TASK
  # Cellranger run on the large HPC queue. The callable is given the library
  # csv XCom written by 'configure_cellranger_run', the analysis description
  # from 'fetch_analysis_info', and the key ('cellranger_output') to use for its
  # own output XCom. The core/memory options mirror the queue name.
  run_cellranger = PythonOperator(
    task_id='run_cellranger',
    python_callable=run_cellranger_tool,
    queue='hpc_64G16t24hr',
    dag=dag,
    params={
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'library_csv_xcom_key': 'cellranger_library_csv',
      'library_csv_xcom_pull_task': 'configure_cellranger_run',
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_options': ['--localcores 16','--localmem 64'],
    })
  ## PIPELINE
  configure_cellranger_run >> run_cellranger
  ## TASK
  # Branch operator that selects which post-cellranger analyses run. The params
  # map logical names to candidate task ids and point at the library csv XCom.
  # NOTE(review): 'run_picard_alignment_summary' is not a direct downstream of
  # this task in this file, and no task with id 'convert_bam_to_cram' is defined
  # here (the cram task id is 'convert_cellranger_bam_to_cram'; the direct
  # downstream is 'copy_bam_for_parallel_runs'). Confirm against
  # decide_analysis_branch_func which ids it actually returns.
  decide_analysis_branch = BranchPythonOperator(
    task_id='decide_analysis_branch',
    python_callable=decide_analysis_branch_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'load_cellranger_result_to_db_task': 'load_cellranger_result_to_db',
      'run_scanpy_for_sc_5p_task': 'run_scanpy_for_sc_5p',
      'run_scirpy_for_vdj_task': 'run_scirpy_for_vdj',
      'run_scirpy_for_vdj_b_task': 'run_scirpy_for_vdj_b',
      'run_scirpy_vdj_t_task': 'run_scirpy_for_vdj_t',
      'run_seurat_for_sc_5p_task': 'run_seurat_for_sc_5p',
      'run_picard_alignment_summary_task': 'run_picard_alignment_summary',
      'convert_bam_to_cram_task': 'convert_bam_to_cram',
      'library_csv_xcom_key': 'cellranger_library_csv',
      'library_csv_xcom_pull_task': 'configure_cellranger_run',
    })
  ## PIPELINE
  run_cellranger >> decide_analysis_branch
  ## TASK
  # Loads the cellranger output (XCom 'cellranger_output' of 'run_cellranger').
  # The callable is told to publish the loaded files under
  # 'loaded_output_files', the html report under 'html_report_file' and the
  # collection name under 'sample_igf_id'; the upload tasks below pull those.
  load_cellranger_result_to_db = PythonOperator(
    task_id='load_cellranger_result_to_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_cellranger_result_to_db_func,
    params={
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'collection_type': 'CELLRANGER_MULTI',
      'collection_table': 'sample',
      'xcom_collection_name_key': 'sample_igf_id',
      'genome_column': 'genome_build',
      'analysis_name': 'cellranger_multi',
      'output_xcom_key': 'loaded_output_files',
      'html_xcom_key': 'html_report_file',
      'html_report_file_name': 'web_summary.html',
    })
  # FTP upload of the cellranger html report.
  upload_cellranger_report_to_ftp = PythonOperator(
    task_id='upload_cellranger_report_to_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'html_report_file',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_CELLRANGER_MULTI',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  # Box upload of the cellranger html report.
  # `python_callable` was None, which PythonOperator rejects when the DAG file
  # is parsed; it now uses the same Box uploader as the other *_to_box tasks.
  # NOTE(review): 'analysis_tag' follows the '<tool>_single_sample_report'
  # pattern of the sibling Box uploads - confirm the value expected by
  # upload_analysis_file_to_box.
  upload_cellranger_report_to_box = PythonOperator(
    task_id='upload_cellranger_report_to_box',
    dag=dag,
    queue='hpc_4G',
    python_callable=upload_analysis_file_to_box,
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'html_report_file',
      'analysis_tag': 'cellranger_multi_single_sample_report',
    })
  # iRODS upload of all loaded cellranger output files.
  upload_cellranger_results_to_irods = PythonOperator(
    task_id='upload_cellranger_results_to_irods',
    dag=dag,
    queue='hpc_4G',
    python_callable=irods_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'loaded_output_files',
      'collection_name_key': 'sample_igf_id',
      'analysis_name': 'cellranger_multi',
    })
  ## PIPELINE
  decide_analysis_branch >> load_cellranger_result_to_db
  load_cellranger_result_to_db >> upload_cellranger_report_to_ftp
  load_cellranger_result_to_db >> upload_cellranger_report_to_box
  load_cellranger_result_to_db >> upload_cellranger_results_to_irods
  ## TASK
  # Scanpy notebook run on the cellranger 'count' output. The callable is told
  # to publish the notebook under 'scanpy_notebook' and the cellbrowser
  # directories under 'cellbrowser_dirs'.
  run_scanpy_for_sc_5p = PythonOperator(
    task_id='run_scanpy_for_sc_5p',
    python_callable=run_singlecell_notebook_wrapper_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'count_dir': 'count',
      'analysis_name': 'scanpy',
      'output_notebook_key': 'scanpy_notebook',
      'output_cellbrowser_key': 'cellbrowser_dirs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    })
  # Loads the scanpy notebook; the result is published as 'output_db_files'.
  load_scanpy_report_for_sc_5p_to_db = PythonOperator(
    task_id='load_scanpy_report_for_sc_5p_to_db',
    python_callable=load_analysis_files_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scanpy_for_sc_5p',
      'file_name_key': 'scanpy_notebook',
      'analysis_name': 'scanpy_5p',
      'collection_type': 'SCANPY_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  # FTP upload of the loaded scanpy report.
  upload_scanpy_report_for_sc_5p_to_ftp = PythonOperator(
    task_id='upload_scanpy_report_for_sc_5p_to_ftp',
    python_callable=ftp_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_scanpy_report_for_sc_5p_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCANPY_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  # Box upload of the loaded scanpy report.
  upload_scanpy_report_for_sc_5p_to_box = PythonOperator(
    task_id='upload_scanpy_report_for_sc_5p_to_box',
    python_callable=upload_analysis_file_to_box,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_scanpy_report_for_sc_5p_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scanpy_single_sample_report',
    })
  # FTP upload of the cellbrowser directories, pulled straight from the
  # notebook task rather than from the db-loading task.
  upload_cellbrowser_for_sc_5p_to_ftp = PythonOperator(
    task_id='upload_cellbrowser_for_sc_5p_to_ftp',
    python_callable=ftp_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'run_scanpy_for_sc_5p',
      'xcom_pull_files_key': 'cellbrowser_dirs',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_CELLBROWSER',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  ## PIPELINE
  decide_analysis_branch >> run_scanpy_for_sc_5p
  run_scanpy_for_sc_5p >> load_scanpy_report_for_sc_5p_to_db
  load_scanpy_report_for_sc_5p_to_db >> upload_scanpy_report_for_sc_5p_to_ftp
  load_scanpy_report_for_sc_5p_to_db >> upload_scanpy_report_for_sc_5p_to_box
  run_scanpy_for_sc_5p >> upload_cellbrowser_for_sc_5p_to_ftp
  ## TASK
  # Scirpy notebook run using the 'vdj' and 'count' cellranger outputs; the
  # callable is told to publish the notebook under 'scirpy_notebook'.
  run_scirpy_for_vdj = PythonOperator(
    task_id='run_scirpy_for_vdj',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_singlecell_notebook_wrapper_func,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'analysis_name': 'scirpy',
      'vdj_dir': 'vdj',
      'count_dir': 'count',
      'output_notebook_key': 'scirpy_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    })
  # Loads the scirpy notebook; the result is published as 'output_db_files'.
  load_scirpy_report_for_vdj_to_db = PythonOperator(
    task_id='load_scirpy_report_for_vdj_to_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_analysis_files_func,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scirpy_for_vdj',
      'file_name_key': 'scirpy_notebook',
      'analysis_name': 'scirpy_vdj',
      'collection_type': 'SCIRPY_VDJ_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  # FTP upload of the loaded report. 'xcom_pull_task' must be the id of the
  # loading task above; it previously named a non-existent
  # 'load_scanpy_report_for_vdj_to_db' task.
  upload_scirpy_report_for_vdj_to_ftp = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_to_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCIRPY_VDJ_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  # Box upload of the loaded report (same XCom source correction as above).
  upload_scirpy_report_for_vdj_to_box = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_to_box',
    dag=dag,
    queue='hpc_4G',
    python_callable=upload_analysis_file_to_box,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scirpy_vdj_single_sample_report',
    })
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj
  run_scirpy_for_vdj >> load_scirpy_report_for_vdj_to_db
  load_scirpy_report_for_vdj_to_db >> upload_scirpy_report_for_vdj_to_ftp
  load_scirpy_report_for_vdj_to_db >> upload_scirpy_report_for_vdj_to_box
  ## TASK
  # Scirpy notebook run using the 'vdj_b' and 'count' cellranger outputs; the
  # callable is told to publish the notebook under 'scirpy_notebook'.
  run_scirpy_for_vdj_b = PythonOperator(
    task_id='run_scirpy_for_vdj_b',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_singlecell_notebook_wrapper_func,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'analysis_name': 'scirpy',
      'vdj_dir': 'vdj_b',
      'count_dir': 'count',
      'output_notebook_key': 'scirpy_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    })
  # Loads the scirpy notebook; the result is published as 'output_db_files'.
  load_scirpy_report_for_vdj_b_to_db = PythonOperator(
    task_id='load_scirpy_report_for_vdj_b_to_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_analysis_files_func,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scirpy_for_vdj_b',
      'file_name_key': 'scirpy_notebook',
      'analysis_name': 'scirpy_vdj_b',
      'collection_type': 'SCIRPY_VDJ_B_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  # FTP upload of the loaded report. 'xcom_pull_task' must be the id of the
  # loading task above; it previously named a non-existent
  # 'load_scanpy_report_for_vdj_b_to_db' task.
  upload_scirpy_report_for_vdj_b_to_ftp = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_b_to_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_b_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCIRPY_VDJ_B_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  # Box upload of the loaded report (same XCom source correction as above).
  # The task id was 'upload_scanpy_report_for_vdj_b_to_box'; renamed so it
  # matches the variable name and the vdj / vdj_t sibling tasks.
  upload_scirpy_report_for_vdj_b_to_box = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_b_to_box',
    dag=dag,
    queue='hpc_4G',
    python_callable=upload_analysis_file_to_box,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_b_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scirpy_vdj_b_single_sample_report',
    })
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj_b
  run_scirpy_for_vdj_b >> load_scirpy_report_for_vdj_b_to_db
  load_scirpy_report_for_vdj_b_to_db >> upload_scirpy_report_for_vdj_b_to_ftp
  load_scirpy_report_for_vdj_b_to_db >> upload_scirpy_report_for_vdj_b_to_box
  ## TASK
  # Scirpy notebook run using the 'vdj_t' and 'count' cellranger outputs; the
  # callable is told to publish the notebook under 'scirpy_notebook'.
  run_scirpy_for_vdj_t = PythonOperator(
    task_id='run_scirpy_for_vdj_t',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_singlecell_notebook_wrapper_func,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'analysis_name': 'scirpy',
      'vdj_dir': 'vdj_t',
      'count_dir': 'count',
      'output_notebook_key': 'scirpy_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    })
  # Loads the scirpy notebook; the result is published as 'output_db_files'.
  load_scirpy_report_for_vdj_t_to_db = PythonOperator(
    task_id='load_scirpy_report_for_vdj_t_to_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_analysis_files_func,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scirpy_for_vdj_t',
      'file_name_key': 'scirpy_notebook',
      'analysis_name': 'scirpy_vdj_t',
      'collection_type': 'SCIRPY_VDJ_T_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  # FTP upload of the loaded report. 'xcom_pull_task' must be the id of the
  # loading task above; it previously named a non-existent
  # 'load_scanpy_report_for_vdj_t_to_db' task.
  upload_scirpy_report_for_vdj_t_to_ftp = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_t_to_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_t_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCIRPY_VDJ_T_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  # Box upload of the loaded report (same XCom source correction as above).
  upload_scirpy_report_for_vdj_t_to_box = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_t_to_box',
    dag=dag,
    queue='hpc_4G',
    python_callable=upload_analysis_file_to_box,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_t_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scirpy_vdj_t_single_sample_report',
    })
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj_t
  run_scirpy_for_vdj_t >> load_scirpy_report_for_vdj_t_to_db
  load_scirpy_report_for_vdj_t_to_db >> upload_scirpy_report_for_vdj_t_to_ftp
  load_scirpy_report_for_vdj_t_to_db >> upload_scirpy_report_for_vdj_t_to_box
  ## TASK
  # Seurat notebook run (kernel 'R') using the 'vdj' and 'count' cellranger
  # outputs; the callable is told to publish the notebook as 'seurat_notebook'.
  run_seurat_for_sc_5p = PythonOperator(
    task_id='run_seurat_for_sc_5p',
    python_callable=run_singlecell_notebook_wrapper_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'R',
      'analysis_name': 'seurat',
      'vdj_dir': 'vdj',
      'count_dir': 'count',
      'output_notebook_key': 'seurat_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    })
  # Loads the seurat notebook; the result is published as 'output_db_files'.
  load_seurat_report_for_sc_5p_db = PythonOperator(
    task_id='load_seurat_report_for_sc_5p_db',
    python_callable=load_analysis_files_func,
    queue='hpc_4G',
    dag=dag,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_seurat_for_sc_5p',
      'file_name_key': 'seurat_notebook',
      'analysis_name': 'seurat_5p',
      'collection_type': 'SEURAT_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  # FTP upload of the loaded seurat report.
  upload_seurat_report_for_sc_5p_ftp = PythonOperator(
    task_id='upload_seurat_report_for_sc_5p_ftp',
    python_callable=ftp_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_seurat_report_for_sc_5p_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SEURAT_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  # Box upload of the loaded seurat report.
  upload_seurat_report_for_sc_5p_to_box = PythonOperator(
    task_id='upload_seurat_report_for_sc_5p_to_box',
    python_callable=upload_analysis_file_to_box,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'load_seurat_report_for_sc_5p_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'seurat_single_sample_report',
    })
  ## PIPELINE
  decide_analysis_branch >> run_seurat_for_sc_5p
  run_seurat_for_sc_5p >> load_seurat_report_for_sc_5p_db
  load_seurat_report_for_sc_5p_db >> upload_seurat_report_for_sc_5p_ftp
  load_seurat_report_for_sc_5p_db >> upload_seurat_report_for_sc_5p_to_box
  ## TASK
  # Branch operator placed in front of the BAM-based QC tasks. It reads the
  # 'cellranger_output' XCom of 'run_cellranger' and is given the ids of the
  # downstream tasks that consume its output.
  # NOTE(review): the downstream tasks pull XCom keys named after their own
  # task id from this task, so the callable presumably pushes one entry per
  # name in 'list_of_tasks' - confirm in
  # index_and_copy_bam_for_parallel_analysis.
  copy_bam_for_parallel_runs = BranchPythonOperator(
    task_id='copy_bam_for_parallel_runs',
    python_callable=index_and_copy_bam_for_parallel_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_files_key': 'cellranger_output',
      'xcom_pull_task': 'run_cellranger',
      'list_of_tasks': [
        'convert_cellranger_bam_to_cram',
        'run_picard_alignment_summary',
        'run_picard_qual_summary',
        'run_picard_rna_summary',
        'run_picard_gc_summary',
        'run_picard_base_dist_summary',
        'run_samtools_stats',
      ],
    })
  ## TASK
  # BAM to CRAM conversion on a 4-thread queue. Input comes from the
  # 'convert_cellranger_bam_to_cram' XCom of 'copy_bam_for_parallel_runs'; the
  # callable is told to publish the resulting files under 'cram_files'.
  convert_bam_to_cram = PythonOperator(
    task_id='convert_cellranger_bam_to_cram',
    python_callable=convert_bam_to_cram_func,
    queue='hpc_4G4t',
    dag=dag,
    params={
      'xcom_pull_files_key': 'convert_cellranger_bam_to_cram',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'threads': 4,
      'analysis_name': 'cellranger',
      'collection_type': 'ALIGNED_CRAM',
      'collection_table': 'sample',
      'cram_files_xcom_key': 'cram_files',
    })
  # iRODS upload of the cram files produced above.
  upload_cram_to_irods = PythonOperator(
    task_id='upload_cram_to_irods',
    python_callable=irods_files_upload_for_analysis,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_task': 'convert_cellranger_bam_to_cram',
      'xcom_pull_files_key': 'cram_files',
      'collection_name_key': 'sample_igf_id',
      'analysis_name': 'cellranger_multi',
    })
  ## PIPELINE
  decide_analysis_branch >> copy_bam_for_parallel_runs
  copy_bam_for_parallel_runs >> convert_bam_to_cram
  convert_bam_to_cram >> upload_cram_to_irods
  ## TASK
  # Picard CollectAlignmentSummaryMetrics; the callable is told to publish its
  # output under 'picard_alignment_summary'.
  # 'xcom_pull_files_key' was 'cellranger_output', which is the key published
  # by 'run_cellranger', not by 'copy_bam_for_parallel_runs'. The other
  # consumers of 'copy_bam_for_parallel_runs' in this DAG pull a key named
  # after the consuming task, so this task now does the same.
  # NOTE(review): confirm the per-task key convention in
  # index_and_copy_bam_for_parallel_analysis.
  run_picard_alignment_summary = PythonOperator(
    task_id='run_picard_alignment_summary',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_picard_for_cellranger,
    params={
      'xcom_pull_files_key': 'run_picard_alignment_summary',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'java_param': '-Xmx4g',
      'picard_command': 'CollectAlignmentSummaryMetrics',
      'picard_option': {},
      'analysis_files_xcom_key': 'picard_alignment_summary',
      'bam_files_xcom_key': None,
    })
  ## PIPELINE
  copy_bam_for_parallel_runs >> run_picard_alignment_summary
  ## TASK
  # Picard QualityScoreDistribution; the callable is told to publish its
  # output under 'picard_qual_summary'.
  # 'xcom_pull_files_key' was 'cellranger_output', which is the key published
  # by 'run_cellranger', not by 'copy_bam_for_parallel_runs'. The other
  # consumers of 'copy_bam_for_parallel_runs' in this DAG pull a key named
  # after the consuming task, so this task now does the same.
  # NOTE(review): confirm the per-task key convention in
  # index_and_copy_bam_for_parallel_analysis.
  run_picard_qual_summary = PythonOperator(
    task_id='run_picard_qual_summary',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_picard_for_cellranger,
    params={
      'xcom_pull_files_key': 'run_picard_qual_summary',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'java_param': '-Xmx4g',
      'picard_command': 'QualityScoreDistribution',
      'picard_option': {},
      'analysis_files_xcom_key': 'picard_qual_summary',
      'bam_files_xcom_key': None,
    })
  ## PIPELINE
  copy_bam_for_parallel_runs >> run_picard_qual_summary
  ## TASK
  # Picard CollectRnaSeqMetrics; the callable is told to publish its output
  # under 'picard_rna_summary'.
  # 'xcom_pull_files_key' was 'cellranger_output', which is the key published
  # by 'run_cellranger', not by 'copy_bam_for_parallel_runs'. The other
  # consumers of 'copy_bam_for_parallel_runs' in this DAG pull a key named
  # after the consuming task, so this task now does the same.
  # NOTE(review): confirm the per-task key convention in
  # index_and_copy_bam_for_parallel_analysis.
  run_picard_rna_summary = PythonOperator(
    task_id='run_picard_rna_summary',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_picard_for_cellranger,
    params={
      'xcom_pull_files_key': 'run_picard_rna_summary',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'java_param': '-Xmx4g',
      'picard_command': 'CollectRnaSeqMetrics',
      'picard_option': {},
      'analysis_files_xcom_key': 'picard_rna_summary',
      'bam_files_xcom_key': None,
    })
  ## PIPELINE
  copy_bam_for_parallel_runs >> run_picard_rna_summary
  ## TASK
  # Picard CollectGcBiasMetrics; the callable is told to publish its output
  # under 'picard_gc_summary'.
  # 'xcom_pull_files_key' was 'cellranger_output', which is the key published
  # by 'run_cellranger', not by 'copy_bam_for_parallel_runs'. The other
  # consumers of 'copy_bam_for_parallel_runs' in this DAG pull a key named
  # after the consuming task, so this task now does the same.
  # NOTE(review): confirm the per-task key convention in
  # index_and_copy_bam_for_parallel_analysis.
  run_picard_gc_summary = PythonOperator(
    task_id='run_picard_gc_summary',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_picard_for_cellranger,
    params={
      'xcom_pull_files_key': 'run_picard_gc_summary',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'java_param': '-Xmx4g',
      'picard_command': 'CollectGcBiasMetrics',
      'picard_option': {},
      'analysis_files_xcom_key': 'picard_gc_summary',
      'bam_files_xcom_key': None,
    })
  ## PIPELINE
  copy_bam_for_parallel_runs >> run_picard_gc_summary
  ## TASK
  # Picard CollectBaseDistributionByCycle. Input comes from the
  # 'run_picard_base_dist_summary' XCom of 'copy_bam_for_parallel_runs'; the
  # callable is told to publish its output under 'picard_base_summary'.
  run_picard_base_dist_summary = PythonOperator(
    task_id='run_picard_base_dist_summary',
    python_callable=run_picard_for_cellranger,
    queue='hpc_4G',
    dag=dag,
    params={
      'xcom_pull_files_key': 'run_picard_base_dist_summary',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'java_param': '-Xmx4g',
      'picard_command': 'CollectBaseDistributionByCycle',
      'picard_option': {},
      'analysis_files_xcom_key': 'picard_base_summary',
      'bam_files_xcom_key': None,
    })
  ## PIPELINE
  copy_bam_for_parallel_runs >> run_picard_base_dist_summary
  ## TASK
  # samtools 'stats' on a 4-thread queue. Input comes from the
  # 'run_samtools_stats' XCom of 'copy_bam_for_parallel_runs'; the callable is
  # told to publish its output under 'samtools_stats'.
  run_samtools_stats = PythonOperator(
    task_id='run_samtools_stats',
    python_callable=run_samtools_for_cellranger,
    queue='hpc_4G4t',
    dag=dag,
    params={
      'xcom_pull_files_key': 'run_samtools_stats',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'samtools_command': 'stats',
      'threads': 4,
      'analysis_files_xcom_key': 'samtools_stats',
    })
  ## PIPELINE
  copy_bam_for_parallel_runs >> run_samtools_stats
  ## TASK
  # samtools 'idxstats'. It is chained after 'run_samtools_stats' and pulls the
  # same 'run_samtools_stats' XCom key from 'copy_bam_for_parallel_runs'; the
  # callable is told to publish its output under 'samtools_idxstats'.
  run_samtools_idxstats = PythonOperator(
    task_id='run_samtools_idxstats',
    python_callable=run_samtools_for_cellranger,
    queue='hpc_4G4t',
    dag=dag,
    params={
      'xcom_pull_files_key': 'run_samtools_stats',
      'xcom_pull_task': 'copy_bam_for_parallel_runs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'load_metrics_to_cram': True,
      'samtools_command': 'idxstats',
      'threads': 4,
      'analysis_files_xcom_key': 'samtools_idxstats',
    })
  ## PIPELINE
  run_samtools_stats >> run_samtools_idxstats
  ## TASK
  # Calls run_multiqc_for_cellranger on the hpc_4G queue.
  #
  # 'list_of_analysis_xcoms_and_tasks' maps an upstream task id to the
  # XCom key that task is configured to publish its output under
  # (the 'analysis_files_xcom_key' of the picard / samtools tasks).
  #
  # trigger_rule 'none_failed_or_skipped': the task waits for all of its
  # upstream tasks and runs when none of them failed and at least one
  # succeeded, so skipped upstream branches do not block the report.
  run_multiqc = \
    PythonOperator(
      task_id='run_multiqc',
      dag=dag,
      queue='hpc_4G',
      trigger_rule='none_failed_or_skipped',
      python_callable=run_multiqc_for_cellranger,
      params={
        'list_of_analysis_xcoms_and_tasks':{
          'run_cellranger':'cellranger_output',
          'run_picard_alignment_summary':'picard_alignment_summary',
          'run_picard_qual_summary':'picard_qual_summary',
          'run_picard_rna_summary':'picard_rna_summary',
          'run_picard_gc_summary':'picard_gc_summary',
          'run_picard_base_dist_summary':'picard_base_summary',
          'run_samtools_stats':'samtools_stats',
          'run_samtools_idxstats':'samtools_idxstats'},
        'analysis_description_xcom_pull_task':'fetch_analysis_info',
        'analysis_description_xcom_key':'analysis_description',
        'use_ephemeral_space':True,
        'multiqc_html_file_xcom_key':'multiqc_html',
        'multiqc_data_file_xcom_key':'multiqc_data',
        # MultiQC module names; was misspelled 'picad', which does not
        # match any MultiQC module
        'tool_order_list':['picard','samtools']})
  ## PIPELINE
  # run_samtools_stats is covered transitively: it is upstream of
  # run_samtools_idxstats
  run_picard_alignment_summary >> run_multiqc
  run_picard_qual_summary >> run_multiqc
  run_picard_rna_summary >> run_multiqc
  run_picard_gc_summary >> run_multiqc
  run_picard_base_dist_summary >> run_multiqc
  run_samtools_idxstats >> run_multiqc
  ## TASK
  # Calls load_analysis_files_func on the hpc_4G queue, configured with
  # the 'multiqc_html' XCom key of run_multiqc and the collection type
  # MULTIQC_HTML on the 'sample' collection table.
  load_multiqc_html = PythonOperator(
    dag=dag,
    task_id='load_multiqc_html',
    python_callable=load_analysis_files_func,
    queue='hpc_4G',
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_multiqc',
      'file_name_key': 'multiqc_html',
      'analysis_name': 'multiqc',
      'collection_type': 'MULTIQC_HTML',
      'collection_table': 'sample',
      # key the two upload_multiqc_* tasks pull from this task
      'output_files_key': 'output_db_files',
    },
  )
  ## PIPELINE
  run_multiqc.set_downstream(load_multiqc_html)
  ## TASK
  # Calls ftp_files_upload_for_analysis on the hpc_4G queue, configured
  # with the 'output_db_files' XCom key of load_multiqc_html and the
  # collection type FTP_MULTIQC_HTML.
  upload_multiqc_to_ftp = PythonOperator(
    dag=dag,
    task_id='upload_multiqc_to_ftp',
    python_callable=ftp_files_upload_for_analysis,
    queue='hpc_4G',
    params={
      'xcom_pull_task': 'load_multiqc_html',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_MULTIQC_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    },
  )
  ## PIPELINE
  load_multiqc_html.set_downstream(upload_multiqc_to_ftp)
  ## TASK
  # Calls upload_analysis_file_to_box on the hpc_4G queue, configured
  # with the 'output_db_files' XCom key of load_multiqc_html and the
  # analysis tag 'multiqc_report'.
  # NOTE(review): unlike upload_multiqc_to_ftp, nothing in this part of
  # the DAG is wired downstream of this task -- confirm that is intended.
  upload_multiqc_to_box = PythonOperator(
    dag=dag,
    task_id='upload_multiqc_to_box',
    python_callable=upload_analysis_file_to_box,
    queue='hpc_4G',
    params={
      'xcom_pull_task': 'load_multiqc_html',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'multiqc_report',
    },
  )
  ## PIPELINE
  load_multiqc_html.set_downstream(upload_multiqc_to_box)
  ## TASK
  # Calls change_pipeline_status with new_status 'FINISHED' and
  # no_change_status 'SEEDED' on the hpc_4G queue.
  # trigger_rule 'none_failed_or_skipped' lets this fan-in task run even
  # when some of the upstream upload branches were skipped.
  update_analysis_and_status = PythonOperator(
    dag=dag,
    task_id='update_analysis_and_status',
    python_callable=change_pipeline_status,
    queue='hpc_4G',
    trigger_rule='none_failed_or_skipped',
    params={
      'new_status': 'FINISHED',
      'no_change_status': 'SEEDED',
    },
  )
  ## PIPELINE
  # every upload branch listed here is a direct upstream of the final
  # status update
  update_analysis_and_status.set_upstream([
    upload_multiqc_to_ftp,
    upload_scanpy_report_for_sc_5p_to_ftp,
    upload_scanpy_report_for_sc_5p_to_box,
    upload_cellbrowser_for_sc_5p_to_ftp,
    upload_scirpy_report_for_vdj_to_ftp,
    upload_scirpy_report_for_vdj_to_box,
    upload_scirpy_report_for_vdj_b_to_ftp,
    upload_scirpy_report_for_vdj_b_to_box,
    upload_scirpy_report_for_vdj_t_to_ftp,
    upload_scirpy_report_for_vdj_t_to_box,
    upload_seurat_report_for_sc_5p_ftp,
    upload_seurat_report_for_sc_5p_to_box,
    upload_cellranger_results_to_irods,
    upload_cellranger_report_to_ftp,
    upload_cellranger_report_to_box,
    upload_cram_to_irods,
  ])