# dag9_tenx_single_cell_immune_profiling.py (16 KB)
  1. from datetime import timedelta
  2. import os,json,logging,subprocess
  3. from airflow.models import DAG,Variable
  4. from airflow.utils.dates import days_ago
  5. from airflow.operators.python_operator import PythonOperator
  6. from airflow.operators.python_operator import BranchPythonOperator
  7. from airflow.operators.dummy_operator import DummyOperator
  8. from igf_airflow.logging.upload_log_msg import send_log_to_channels,log_success,log_failure,log_sleep
  9. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import fetch_analysis_info_and_branch_func
  10. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import configure_cellranger_run_func
  11. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_sc_read_trimmming_func
  12. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_cellranger_tool
  13. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import decide_analysis_branch_func
  14. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_cellranger_result_to_db_func
  15. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import ftp_files_upload_for_analysis
  16. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import irods_files_upload_for_analysis
  17. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_scanpy_for_sc_5p_func
  18. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_scirpy_for_vdj_func
  19. ## ARGS
  20. default_args = {
  21. 'owner': 'airflow',
  22. 'depends_on_past': False,
  23. 'start_date': days_ago(2),
  24. 'email_on_failure': False,
  25. 'email_on_retry': False,
  26. 'retries': 4,
  27. 'max_active_runs':10,
  28. 'catchup':False,
  29. 'retry_delay': timedelta(minutes=5),
  30. 'provide_context': True,
  31. }
  32. FEATURE_TYPE_LIST = Variable.get('tenx_single_cell_immune_profiling_feature_types')
  33. ## DAG
  34. dag = \
  35. DAG(
  36. dag_id='dag9_tenx_single_cell_immune_profiling',
  37. schedule_interval=None,
  38. tags=['hpc','analysis','tenx','sc'],
  39. default_args=default_args,
  40. orientation='LR')
  41. with dag:
  42. ## TASK
  43. fetch_analysis_info_and_branch = \
  44. BranchPythonOperator(
  45. task_id='fetch_analysis_info',
  46. dag=dag,
  47. queue='hpc_4g',
  48. params={'no_analysis_task':'no_analysis',
  49. 'analysis_description_xcom_key':'analysis_description',
  50. 'analysis_info_xcom_key':'analysis_info'},
  51. python_callable=fetch_analysis_info_and_branch_func)
  ## TASK
  # Join point of the per feature type trimming branches (every
  # collect_trimmed_files_* task is set upstream of this task).
  # With the default 'all_success' trigger rule a join below a branch
  # operator is skipped as soon as one of its upstream branches is skipped.
  # 'none_failed_or_skipped' lets the task run when no upstream task failed
  # and at least one of them succeeded, and still skips it when every branch
  # was skipped.
  # NOTE(review): this trigger rule needs Airflow >= 1.10.10.
  configure_cellranger_run = PythonOperator(
    task_id='configure_cellranger_run',
    dag=dag,
    queue='hpc_4g',
    trigger_rule='none_failed_or_skipped',
    python_callable=configure_cellranger_run_func,
    params={
      'xcom_pull_task_id': 'fetch_analysis_info_and_branch',
      'analysis_description_xcom_key': 'analysis_description',
      'analysis_info_xcom_key': 'analysis_info',
      'library_csv_xcom_key': 'cellranger_library_csv',
    },
  )
  # For every feature type: one marker task named after the feature type,
  # 32 trimming tasks (run ids 0 to 31) and one collector task which joins
  # the trimming tasks in front of configure_cellranger_run.
  for analysis_name in FEATURE_TYPE_LIST:
    ## TASK
    task_branch = DummyOperator(dag=dag, task_id=analysis_name)
    ## TASK
    run_trim_list = [
      PythonOperator(
        task_id='run_trim_{0}_{1}'.format(analysis_name, run_id),
        dag=dag,
        python_callable=run_sc_read_trimmming_func,
        params={
          'xcom_pull_task_id': 'fetch_analysis_info_and_branch',
          'analysis_info_xcom_key': 'analysis_info',
          'analysis_name': analysis_name,
          'run_id': run_id,
          'r1_length': 26,
          'r2_length': 0,
          'fastq_input_dir_tag': 'fastq_dir',
          'fastq_output_dir_tag': 'output_path',
        },
      )
      for run_id in range(32)
    ]
    ## TASK
    collect_trimmed_files = DummyOperator(
      dag=dag,
      task_id='collect_trimmed_files_{0}'.format(analysis_name),
    )
    ## PIPELINE
    # branch task -> marker -> trimming tasks -> collector -> configure
    task_branch.set_upstream(fetch_analysis_info_and_branch)
    task_branch.set_downstream(run_trim_list)
    collect_trimmed_files.set_upstream(run_trim_list)
    collect_trimmed_files.set_downstream(configure_cellranger_run)
  ## TASK
  # Placeholder task directly downstream of the first branch task; its task
  # id is the value that task receives as 'no_analysis_task' in its params.
  no_analysis = DummyOperator(dag=dag, task_id='no_analysis')
  ## PIPELINE
  fetch_analysis_info_and_branch.set_downstream(no_analysis)
  ## TASK
  # Calls run_cellranger_tool after configure_cellranger_run.
  # The params name the tasks and keys used for the XCom exchange and carry
  # the list of extra cellranger options.
  run_cellranger = PythonOperator(
    dag=dag,
    task_id='run_cellranger',
    python_callable=run_cellranger_tool,
    params={
      'analysis_description_xcom_pull_task': 'fetch_analysis_info_and_branch',
      'analysis_description_xcom_key': 'analysis_description',
      'library_csv_xcom_key': 'cellranger_library_csv',
      'library_csv_xcom_pull_task': 'configure_cellranger_run',
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_options': ['--localcores 8', '--localmem 64'],
    },
  )
  ## PIPELINE
  run_cellranger.set_upstream(configure_cellranger_run)
  ## TASK
  # Second branch point, executed after run_cellranger.
  # The '*_task' params hand the ids of the candidate downstream tasks to
  # the callable, so every value has to be the task_id of a task defined in
  # this DAG.
  decide_analysis_branch = BranchPythonOperator(
    task_id='decide_analysis_branch',
    dag=dag,
    python_callable=decide_analysis_branch_func,
    params={
      'load_cellranger_result_to_db_task': 'load_cellranger_result_to_db',
      'run_scanpy_for_sc_5p_task': 'run_scanpy_for_sc_5p',
      'run_scirpy_for_vdj_task': 'run_scirpy_for_vdj',
      'run_scirpy_for_vdj_b_task': 'run_scirpy_for_vdj_b',
      # The key is left unchanged as the callable may look it up by this
      # name; the value is the id of the task defined further down
      # ('run_scirpy_for_vdj_t'). 'run_scirpy_vdj_t' is not a task of
      # this DAG.
      'run_scirpy_vdj_t_task': 'run_scirpy_for_vdj_t',
      'run_seurat_for_sc_5p_task': 'run_seurat_for_sc_5p',
      'run_picard_alignment_summary_task': 'run_picard_alignment_summary',
      'convert_bam_to_cram_task': 'convert_bam_to_cram',
      'library_csv_xcom_key': 'cellranger_library_csv',
      'library_csv_xcom_pull_task': 'configure_cellranger_run',
    },
  )
  ## PIPELINE
  run_cellranger >> decide_analysis_branch
  ## TASK
  # Branch of decide_analysis_branch which runs
  # load_cellranger_result_to_db_func. The three upload tasks that follow
  # refer to this task through 'xcom_pull_task' and to its output keys
  # ('html_report_file', 'loaded_output_files') through
  # 'xcom_pull_files_key'.
  load_cellranger_result_to_db = PythonOperator(
    dag=dag,
    task_id='load_cellranger_result_to_db',
    python_callable=load_cellranger_result_to_db_func,
    params={
      'analysis_description_xcom_pull_task': 'fetch_analysis_info_and_branch',
      'analysis_description_xcom_key': 'analysis_description',
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'collection_type': 'CELLRANGER_MULTI',
      'collection_table': 'sample',
      'xcom_collection_name_key': 'sample_igf_id',
      'genome_column': 'genome_build',
      'analysis_name': 'cellranger_multi',
      'output_xcom_key': 'loaded_output_files',
      'html_xcom_key': 'html_report_file',
      'html_report_file_name': 'web_summary.html',
    },
  )
  # upload of the html report using ftp_files_upload_for_analysis
  upload_report_to_ftp = PythonOperator(
    dag=dag,
    task_id='upload_report_to_ftp',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'html_report_file',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_CELLRANGER_MULTI',
      'collection_table': 'sample',
      'collect_remote_file': True,
    },
  )
  # placeholder (DummyOperator), only the params are declared so far
  upload_report_to_box = DummyOperator(
    dag=dag,
    task_id='upload_report_to_box',
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'html_report_file',
    },
  )
  # upload of the loaded output files using irods_files_upload_for_analysis
  upload_results_to_irods = PythonOperator(
    dag=dag,
    task_id='upload_results_to_irods',
    python_callable=irods_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'loaded_output_files',
      'collection_name_key': 'sample_igf_id',
      'analysis_name': 'cellranger_multi',
    },
  )
  ## PIPELINE
  decide_analysis_branch >> load_cellranger_result_to_db
  load_cellranger_result_to_db >> [
    upload_report_to_ftp,
    upload_report_to_box,
    upload_results_to_irods,
  ]
  ## TASK
  # Branch of decide_analysis_branch which runs run_scanpy_for_sc_5p_func,
  # followed by three placeholder upload tasks (DummyOperator).
  run_scanpy_for_sc_5p = PythonOperator(
    dag=dag,
    task_id='run_scanpy_for_sc_5p',
    python_callable=run_scanpy_for_sc_5p_func,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'output_notebook_key': 'scanpy_notebook',
      'output_cellbrowser_key': 'cellbrowser_dirs',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info_and_branch',
      'analysis_description_xcom_key': 'analysis_description',
    },
  )
  upload_scanpy_report_for_sc_5p_to_ftp = DummyOperator(
    dag=dag, task_id='upload_scanpy_report_for_sc_5p_to_ftp')
  upload_scanpy_report_for_sc_5p_to_box = DummyOperator(
    dag=dag, task_id='upload_scanpy_report_for_sc_5p_to_box')
  upload_cellbrowser_for_sc_5p_to_ftp = DummyOperator(
    dag=dag, task_id='upload_cellbrowser_for_sc_5p_to_ftp')
  ## PIPELINE
  decide_analysis_branch >> run_scanpy_for_sc_5p
  run_scanpy_for_sc_5p >> [
    upload_scanpy_report_for_sc_5p_to_ftp,
    upload_scanpy_report_for_sc_5p_to_box,
    upload_cellbrowser_for_sc_5p_to_ftp,
  ]
  ## TASK
  # Branch of decide_analysis_branch which runs run_scirpy_for_vdj_func
  # with 'vdj_dir' set to 'vdj', followed by two placeholder upload tasks.
  run_scirpy_for_vdj = PythonOperator(
    dag=dag,
    task_id='run_scirpy_for_vdj',
    python_callable=run_scirpy_for_vdj_func,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'vdj_dir': 'vdj',
      'output_notebook_key': 'scirpy_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info_and_branch',
      'analysis_description_xcom_key': 'analysis_description',
    },
  )
  upload_scanpy_report_for_vdj_to_ftp = DummyOperator(
    dag=dag, task_id='upload_scanpy_report_for_vdj_to_ftp')
  upload_scanpy_report_for_vdj_to_box = DummyOperator(
    dag=dag, task_id='upload_scanpy_report_for_vdj_to_box')
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj
  run_scirpy_for_vdj >> [
    upload_scanpy_report_for_vdj_to_ftp,
    upload_scanpy_report_for_vdj_to_box,
  ]
  ## TASK
  # Branch of decide_analysis_branch which runs run_scirpy_for_vdj_func
  # with 'vdj_dir' set to 'vdj_b'.
  # A PythonOperator is required here: the task is configured with a
  # python_callable, which is not an argument of DummyOperator (a
  # DummyOperator would never call it and Airflow reports it as an invalid
  # argument).
  # NOTE(review): if this task is meant to stay a placeholder for now,
  # use a DummyOperator without the python_callable argument instead.
  run_scirpy_for_vdj_b = PythonOperator(
    task_id='run_scirpy_for_vdj_b',
    dag=dag,
    python_callable=run_scirpy_for_vdj_func,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'vdj_dir': 'vdj_b',
      'output_notebook_key': 'scirpy_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info_and_branch',
      'analysis_description_xcom_key': 'analysis_description',
    },
  )
  # placeholder upload tasks
  upload_scanpy_report_for_vdj_b_to_ftp = DummyOperator(
    task_id='upload_scanpy_report_for_vdj_b_to_ftp',
    dag=dag)
  upload_scanpy_report_for_vdj_b_to_box = DummyOperator(
    task_id='upload_scanpy_report_for_vdj_b_to_box',
    dag=dag)
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj_b
  run_scirpy_for_vdj_b >> upload_scanpy_report_for_vdj_b_to_ftp
  run_scirpy_for_vdj_b >> upload_scanpy_report_for_vdj_b_to_box
  ## TASK
  # Branch of decide_analysis_branch which runs run_scirpy_for_vdj_func
  # with 'vdj_dir' set to 'vdj_t'.
  # A PythonOperator is required here: the task is configured with a
  # python_callable, which is not an argument of DummyOperator (a
  # DummyOperator would never call it and Airflow reports it as an invalid
  # argument).
  # The timeout is passed as 'scanpy_timeout', the key used by the other
  # tasks built on the same callable (this task used 'timeout' before).
  # NOTE(review): if this task is meant to stay a placeholder for now,
  # use a DummyOperator without the python_callable argument instead.
  run_scirpy_for_vdj_t = PythonOperator(
    task_id='run_scirpy_for_vdj_t',
    dag=dag,
    python_callable=run_scirpy_for_vdj_func,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'vdj_dir': 'vdj_t',
      'output_notebook_key': 'scirpy_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info_and_branch',
      'analysis_description_xcom_key': 'analysis_description',
    },
  )
  # placeholder upload tasks
  upload_scanpy_report_for_vdj_t_to_ftp = DummyOperator(
    task_id='upload_scanpy_report_for_vdj_t_to_ftp',
    dag=dag)
  upload_scanpy_report_for_vdj_t_to_box = DummyOperator(
    task_id='upload_scanpy_report_for_vdj_t_to_box',
    dag=dag)
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj_t
  run_scirpy_for_vdj_t >> upload_scanpy_report_for_vdj_t_to_ftp
  run_scirpy_for_vdj_t >> upload_scanpy_report_for_vdj_t_to_box
  ## TASK
  # Placeholder branch (DummyOperator tasks only): seurat run followed by
  # two report upload tasks.
  run_seurat_for_sc_5p = DummyOperator(
    dag=dag,
    task_id='run_seurat_for_sc_5p',
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
    },
  )
  upload_seurat_report_for_sc_5p_ftp = DummyOperator(
    dag=dag, task_id='upload_seurat_report_for_sc_5p_ftp')
  upload_seurat_report_for_sc_5p_to_box = DummyOperator(
    dag=dag, task_id='upload_seurat_report_for_sc_5p_to_box')
  ## PIPELINE
  decide_analysis_branch >> run_seurat_for_sc_5p
  run_seurat_for_sc_5p >> [
    upload_seurat_report_for_sc_5p_ftp,
    upload_seurat_report_for_sc_5p_to_box,
  ]
  ## TASK
  # Placeholder branch (DummyOperator tasks only): cram conversion followed
  # by the irods upload of the cram.
  convert_bam_to_cram = DummyOperator(
    dag=dag,
    task_id='convert_bam_to_cram',
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
    },
  )
  upload_cram_to_irods = DummyOperator(
    dag=dag, task_id='upload_cram_to_irods')
  ## PIPELINE
  decide_analysis_branch >> convert_bam_to_cram >> upload_cram_to_irods
  ## TASK
  # Placeholder QC branch (DummyOperator tasks only). The tasks form one
  # linear chain below decide_analysis_branch which ends in run_multiqc;
  # run_multiqc then fans out to the two multiqc upload tasks.
  run_picard_alignment_summary = DummyOperator(
    dag=dag,
    task_id='run_picard_alignment_summary',
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
    },
  )
  run_picard_qual_summary = DummyOperator(
    dag=dag, task_id='run_picard_qual_summary')
  run_picard_rna_summary = DummyOperator(
    dag=dag, task_id='run_picard_rna_summary')
  run_picard_gc_summary = DummyOperator(
    dag=dag, task_id='run_picard_gc_summary')
  run_samtools_stats = DummyOperator(
    dag=dag, task_id='run_samtools_stats')
  run_samtools_idxstats = DummyOperator(
    dag=dag, task_id='run_samtools_idxstats')
  run_multiqc = DummyOperator(
    dag=dag, task_id='run_multiqc')
  upload_multiqc_to_ftp = DummyOperator(
    dag=dag, task_id='upload_multiqc_to_ftp')
  upload_multiqc_to_box = DummyOperator(
    dag=dag, task_id='upload_multiqc_to_box')
  ## PIPELINE
  # every '>>' returns its right hand side, so the chain links each task
  # to the next one
  (decide_analysis_branch
    >> run_picard_alignment_summary
    >> run_picard_qual_summary
    >> run_picard_rna_summary
    >> run_picard_gc_summary
    >> run_samtools_stats
    >> run_samtools_idxstats
    >> run_multiqc)
  run_multiqc >> [upload_multiqc_to_ftp, upload_multiqc_to_box]
  ## TASK
  # Final join of all the upload tasks (placeholder DummyOperator).
  # The upload tasks sit below decide_analysis_branch, so some of them are
  # skipped whenever only a part of the branches is selected. With the
  # default 'all_success' trigger rule this task would be skipped as well;
  # 'none_failed_or_skipped' lets it run when no upstream task failed and at
  # least one of them succeeded.
  # NOTE(review): this trigger rule needs Airflow >= 1.10.10.
  update_analysis_and_status = DummyOperator(
    task_id='update_analysis_and_status',
    dag=dag,
    trigger_rule='none_failed_or_skipped')
  ## PIPELINE
  # upload_multiqc_to_box is part of the join as well: it was the only
  # upload task which was not connected to this task.
  [
    upload_multiqc_to_ftp,
    upload_multiqc_to_box,
    upload_scanpy_report_for_sc_5p_to_ftp,
    upload_scanpy_report_for_sc_5p_to_box,
    upload_cellbrowser_for_sc_5p_to_ftp,
    upload_scanpy_report_for_vdj_to_ftp,
    upload_scanpy_report_for_vdj_to_box,
    upload_scanpy_report_for_vdj_b_to_ftp,
    upload_scanpy_report_for_vdj_b_to_box,
    upload_scanpy_report_for_vdj_t_to_ftp,
    upload_scanpy_report_for_vdj_t_to_box,
    upload_seurat_report_for_sc_5p_ftp,
    upload_seurat_report_for_sc_5p_to_box,
    upload_results_to_irods,
    upload_report_to_ftp,
    upload_report_to_box,
    upload_cram_to_irods,
  ] >> update_analysis_and_status