"""Airflow DAG ``dag9_tenx_single_cell_immune_profiling``.

Task graph, as wired below:

* ``fetch_analysis_info`` branches either to ``no_analysis`` or to one
  placeholder task per configured feature type.
* Every feature type fans out into 32 ``run_trim_<feature>_<n>`` tasks which
  are joined by ``collect_trimmed_files_<feature>``.
* ``configure_cellranger_run`` -> ``run_cellranger`` ->
  ``decide_analysis_branch`` follow, and the branch leads to the result
  loading / upload tasks and to a set of downstream analysis tasks.
* ``update_analysis_and_status`` joins all upload tasks.

Most downstream analysis tasks are still ``DummyOperator`` placeholders.
"""
from datetime import timedelta
import os
import json
import logging
import subprocess

from airflow.models import DAG, Variable
from airflow.utils.dates import days_ago
from airflow.operators.python_operator import PythonOperator
from airflow.operators.python_operator import BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator

from igf_airflow.logging.upload_log_msg import (
    send_log_to_channels,
    log_success,
    log_failure,
    log_sleep,
)
from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import (
    fetch_analysis_info_and_branch_func,
    configure_cellranger_run_func,
    run_sc_read_trimmming_func,
    run_cellranger_tool,
    decide_analysis_branch_func,
    load_cellranger_result_to_db_func,
    ftp_files_upload_for_analysis,
    irods_files_upload_for_analysis,
)


def _parse_feature_types(raw_value):
    """Return the configured feature types as a list of non-empty strings.

    ``Variable.get`` hands back a plain string unless deserialisation is
    requested, and looping over a string yields single characters. The
    stored format is not visible from this file, so both a JSON array and a
    comma-separated string are accepted.

    :param raw_value: Value read from the Airflow Variable
    :returns: List of feature type names, whitespace-trimmed, empties dropped
    """
    if raw_value is None:
        return []
    parsed = raw_value
    if isinstance(parsed, str):
        try:
            parsed = json.loads(parsed)
        except ValueError:
            pass  # not JSON: handled as comma-separated text below
    if isinstance(parsed, str):
        parsed = parsed.split(',')
    elif isinstance(parsed, (list, tuple, dict)):
        parsed = list(parsed)
    else:
        parsed = [parsed]
    names = [str(item).strip() for item in parsed]
    return [name for name in names if name]


## TASK IDS
# Task ids that other tasks reference through ``params``. Keeping them in one
# place stops the referenced id from drifting away from the real ``task_id``.
FETCH_ANALYSIS_INFO_TASK_ID = 'fetch_analysis_info'
CONFIGURE_CELLRANGER_TASK_ID = 'configure_cellranger_run'
RUN_CELLRANGER_TASK_ID = 'run_cellranger'
LOAD_CELLRANGER_RESULT_TASK_ID = 'load_cellranger_result_to_db'

# Params shared by every placeholder task that will consume Cellranger output.
CELLRANGER_OUTPUT_PARAMS = {
    'cellranger_xcom_key': 'cellranger_output',
    'cellranger_xcom_pull_task': RUN_CELLRANGER_TASK_ID,
}

## ARGS
# Operator-level defaults only; DAG-level settings are passed to DAG() below.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(2),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 4,
    'retry_delay': timedelta(minutes=5),
    'provide_context': True,
}

FEATURE_TYPE_LIST = \
    _parse_feature_types(
        Variable.get('tenx_single_cell_immune_profiling_feature_types'))

## DAG
# ``max_active_runs`` and ``catchup`` are DAG arguments; they were previously
# listed in ``default_args`` where they are not applied to the DAG.
dag = \
    DAG(
        dag_id='dag9_tenx_single_cell_immune_profiling',
        schedule_interval=None,
        tags=['hpc', 'analysis', 'tenx', 'sc'],
        default_args=default_args,
        max_active_runs=10,
        catchup=False,
        orientation='LR')

with dag:
    ## TASK
    fetch_analysis_info_and_branch = \
        BranchPythonOperator(
            task_id=FETCH_ANALYSIS_INFO_TASK_ID,
            dag=dag,
            queue='hpc_4g',
            params={'no_analysis_task': 'no_analysis',
                    'analysis_description_xcom_key': 'analysis_description',
                    'analysis_info_xcom_key': 'analysis_info'},
            python_callable=fetch_analysis_info_and_branch_func)
    ## TASK
    # NOTE(review): this task joins every feature-type branch with the default
    # trigger rule, so it may be skipped whenever the upstream branch operator
    # skips any feature type - confirm against the branch callable.
    configure_cellranger_run = \
        PythonOperator(
            task_id=CONFIGURE_CELLRANGER_TASK_ID,
            dag=dag,
            queue='hpc_4g',
            params={'xcom_pull_task_id': FETCH_ANALYSIS_INFO_TASK_ID,
                    'analysis_description_xcom_key': 'analysis_description',
                    'analysis_info_xcom_key': 'analysis_info',
                    'library_csv_xcom_key': 'cellranger_library_csv'},
            python_callable=configure_cellranger_run_func)
    for analysis_name in FEATURE_TYPE_LIST:
        ## TASK
        task_branch = \
            DummyOperator(
                task_id=analysis_name,
                dag=dag)
        # Fixed fan-out of 32 trimming slots per feature type.
        run_trim_list = [
            PythonOperator(
                task_id='run_trim_{0}_{1}'.format(analysis_name, run_id),
                dag=dag,
                params={'xcom_pull_task_id': FETCH_ANALYSIS_INFO_TASK_ID,
                        'analysis_info_xcom_key': 'analysis_info',
                        'analysis_name': analysis_name,
                        'run_id': run_id,
                        'r1_length': 26,
                        'r2_length': 0,
                        'fastq_input_dir_tag': 'fastq_dir',
                        'fastq_output_dir_tag': 'output_path'},
                python_callable=run_sc_read_trimmming_func)
            for run_id in range(32)]
        ## TASK
        collect_trimmed_files = \
            DummyOperator(
                task_id='collect_trimmed_files_{0}'.format(analysis_name),
                dag=dag)
        ## PIPELINE
        fetch_analysis_info_and_branch >> task_branch
        task_branch >> run_trim_list
        run_trim_list >> collect_trimmed_files
        collect_trimmed_files >> configure_cellranger_run
    ## TASK
    no_analysis = \
        DummyOperator(
            task_id='no_analysis',
            dag=dag)
    ## PIPELINE
    fetch_analysis_info_and_branch >> no_analysis
    ## TASK
    run_cellranger = \
        PythonOperator(
            task_id=RUN_CELLRANGER_TASK_ID,
            dag=dag,
            params={'analysis_description_xcom_pull_task': FETCH_ANALYSIS_INFO_TASK_ID,
                    'analysis_description_xcom_key': 'analysis_description',
                    'library_csv_xcom_key': 'cellranger_library_csv',
                    'library_csv_xcom_pull_task': CONFIGURE_CELLRANGER_TASK_ID,
                    'cellranger_xcom_key': 'cellranger_output',
                    'cellranger_options': ['--localcores 8', '--localmem 64']},
            python_callable=run_cellranger_tool)
    ## PIPELINE
    configure_cellranger_run >> run_cellranger
    ## TASK
    # Every value below must be the task_id of a task defined in this DAG.
    # NOTE(review): the key 'run_scirpy_vdj_t_task' lacks the '_for_' used by
    # its siblings; it is kept as-is because the callable reading it is not
    # visible here - confirm against decide_analysis_branch_func.
    decide_analysis_branch = \
        BranchPythonOperator(
            task_id='decide_analysis_branch',
            dag=dag,
            python_callable=decide_analysis_branch_func,
            params={'load_cellranger_result_to_db_task': LOAD_CELLRANGER_RESULT_TASK_ID,
                    'run_scanpy_for_sc_5p_task': 'run_scanpy_for_sc_5p',
                    'run_scirpy_for_vdj_task': 'run_scirpy_for_vdj',
                    'run_scirpy_for_vdj_b_task': 'run_scirpy_for_vdj_b',
                    'run_scirpy_vdj_t_task': 'run_scirpy_for_vdj_t',
                    'run_seurat_for_sc_5p_task': 'run_seurat_for_sc_5p',
                    'run_picard_alignment_summary_task': 'run_picard_alignment_summary',
                    'convert_bam_to_cram_task': 'convert_bam_to_cram',
                    'library_csv_xcom_key': 'cellranger_library_csv',
                    'library_csv_xcom_pull_task': CONFIGURE_CELLRANGER_TASK_ID})
    ## PIPELINE
    run_cellranger >> decide_analysis_branch
    ## TASK
    load_cellranger_result_to_db = \
        PythonOperator(
            task_id=LOAD_CELLRANGER_RESULT_TASK_ID,
            dag=dag,
            python_callable=load_cellranger_result_to_db_func,
            params={'analysis_description_xcom_pull_task': FETCH_ANALYSIS_INFO_TASK_ID,
                    'analysis_description_xcom_key': 'analysis_description',
                    'cellranger_xcom_key': 'cellranger_output',
                    'cellranger_xcom_pull_task': RUN_CELLRANGER_TASK_ID,
                    'collection_type': 'CELLRANGER_MULTI',
                    'collection_table': 'sample',
                    'xcom_collection_name_key': 'sample_igf_id',
                    'genome_column': 'genome_build',
                    'analysis_name': 'cellranger_multi',
                    'output_xcom_key': 'loaded_output_files',
                    'html_xcom_key': 'html_report_file',
                    'html_report_file_name': 'web_summary.html'})
    upload_report_to_ftp = \
        PythonOperator(
            task_id='upload_report_to_ftp',
            dag=dag,
            python_callable=ftp_files_upload_for_analysis,
            params={'xcom_pull_task': LOAD_CELLRANGER_RESULT_TASK_ID,
                    'xcom_pull_files_key': 'html_report_file',
                    'collection_name_key': 'sample_igf_id',
                    'collection_type': 'FTP_CELLRANGER_MULTI',
                    'collection_table': 'sample',
                    'collect_remote_file': True})
    upload_report_to_box = \
        DummyOperator(
            task_id='upload_report_to_box',
            dag=dag,
            params={'xcom_pull_task': LOAD_CELLRANGER_RESULT_TASK_ID,
                    'xcom_pull_files_key': 'html_report_file'})
    upload_results_to_irods = \
        PythonOperator(
            task_id='upload_results_to_irods',
            dag=dag,
            python_callable=irods_files_upload_for_analysis,
            params={'xcom_pull_task': LOAD_CELLRANGER_RESULT_TASK_ID,
                    'xcom_pull_files_key': 'loaded_output_files',
                    'collection_name_key': 'sample_igf_id',
                    'analysis_name': 'cellranger_multi'})
    ## PIPELINE
    decide_analysis_branch >> load_cellranger_result_to_db
    load_cellranger_result_to_db >> upload_report_to_ftp
    load_cellranger_result_to_db >> upload_report_to_box
    load_cellranger_result_to_db >> upload_results_to_irods
    ## TASK
    run_scanpy_for_sc_5p = \
        DummyOperator(
            task_id='run_scanpy_for_sc_5p',
            dag=dag,
            params=dict(CELLRANGER_OUTPUT_PARAMS))
    upload_scanpy_report_for_sc_5p_to_ftp = \
        DummyOperator(
            task_id='upload_scanpy_report_for_sc_5p_to_ftp',
            dag=dag)
    upload_scanpy_report_for_sc_5p_to_box = \
        DummyOperator(
            task_id='upload_scanpy_report_for_sc_5p_to_box',
            dag=dag)
    run_cellbrowser_for_sc_5p = \
        DummyOperator(
            task_id='run_cellbrowser_for_sc_5p',
            dag=dag)
    upload_cellbrowser_for_sc_5p_to_ftp = \
        DummyOperator(
            task_id='upload_cellbrowser_for_sc_5p_to_ftp',
            dag=dag)
    ## PIPELINE
    decide_analysis_branch >> run_scanpy_for_sc_5p
    run_scanpy_for_sc_5p >> upload_scanpy_report_for_sc_5p_to_ftp
    run_scanpy_for_sc_5p >> upload_scanpy_report_for_sc_5p_to_box
    run_scanpy_for_sc_5p >> run_cellbrowser_for_sc_5p
    run_cellbrowser_for_sc_5p >> upload_cellbrowser_for_sc_5p_to_ftp
    ## TASK
    run_scirpy_for_vdj = \
        DummyOperator(
            task_id='run_scirpy_for_vdj',
            dag=dag,
            params=dict(CELLRANGER_OUTPUT_PARAMS))
    upload_scanpy_report_for_vdj_to_ftp = \
        DummyOperator(
            task_id='upload_scanpy_report_for_vdj_to_ftp',
            dag=dag)
    upload_scanpy_report_for_vdj_to_box = \
        DummyOperator(
            task_id='upload_scanpy_report_for_vdj_to_box',
            dag=dag)
    ## PIPELINE
    decide_analysis_branch >> run_scirpy_for_vdj
    run_scirpy_for_vdj >> upload_scanpy_report_for_vdj_to_ftp
    run_scirpy_for_vdj >> upload_scanpy_report_for_vdj_to_box
    ## TASK
    run_scirpy_for_vdj_b = \
        DummyOperator(
            task_id='run_scirpy_for_vdj_b',
            dag=dag,
            params=dict(CELLRANGER_OUTPUT_PARAMS))
    upload_scanpy_report_for_vdj_b_to_ftp = \
        DummyOperator(
            task_id='upload_scanpy_report_for_vdj_b_to_ftp',
            dag=dag)
    upload_scanpy_report_for_vdj_b_to_box = \
        DummyOperator(
            task_id='upload_scanpy_report_for_vdj_b_to_box',
            dag=dag)
    ## PIPELINE
    decide_analysis_branch >> run_scirpy_for_vdj_b
    run_scirpy_for_vdj_b >> upload_scanpy_report_for_vdj_b_to_ftp
    run_scirpy_for_vdj_b >> upload_scanpy_report_for_vdj_b_to_box
    ## TASK
    run_scirpy_for_vdj_t = \
        DummyOperator(
            task_id='run_scirpy_for_vdj_t',
            dag=dag,
            params=dict(CELLRANGER_OUTPUT_PARAMS))
    upload_scanpy_report_for_vdj_t_to_ftp = \
        DummyOperator(
            task_id='upload_scanpy_report_for_vdj_t_to_ftp',
            dag=dag)
    upload_scanpy_report_for_vdj_t_to_box = \
        DummyOperator(
            task_id='upload_scanpy_report_for_vdj_t_to_box',
            dag=dag)
    ## PIPELINE
    decide_analysis_branch >> run_scirpy_for_vdj_t
    run_scirpy_for_vdj_t >> upload_scanpy_report_for_vdj_t_to_ftp
    run_scirpy_for_vdj_t >> upload_scanpy_report_for_vdj_t_to_box
    ## TASK
    run_seurat_for_sc_5p = \
        DummyOperator(
            task_id='run_seurat_for_sc_5p',
            dag=dag,
            params=dict(CELLRANGER_OUTPUT_PARAMS))
    upload_seurat_report_for_sc_5p_ftp = \
        DummyOperator(
            task_id='upload_seurat_report_for_sc_5p_ftp',
            dag=dag)
    upload_seurat_report_for_sc_5p_to_box = \
        DummyOperator(
            task_id='upload_seurat_report_for_sc_5p_to_box',
            dag=dag)
    ## PIPELINE
    decide_analysis_branch >> run_seurat_for_sc_5p
    run_seurat_for_sc_5p >> upload_seurat_report_for_sc_5p_ftp
    run_seurat_for_sc_5p >> upload_seurat_report_for_sc_5p_to_box
    ## TASK
    convert_bam_to_cram = \
        DummyOperator(
            task_id='convert_bam_to_cram',
            dag=dag,
            params=dict(CELLRANGER_OUTPUT_PARAMS))
    upload_cram_to_irods = \
        DummyOperator(
            task_id='upload_cram_to_irods',
            dag=dag)
    ## PIPELINE
    decide_analysis_branch >> convert_bam_to_cram
    convert_bam_to_cram >> upload_cram_to_irods
    ## TASK
    run_picard_alignment_summary = \
        DummyOperator(
            task_id='run_picard_alignment_summary',
            dag=dag,
            params=dict(CELLRANGER_OUTPUT_PARAMS))
    ## PIPELINE
    decide_analysis_branch >> run_picard_alignment_summary
    ## TASK
    run_picard_qual_summary = \
        DummyOperator(
            task_id='run_picard_qual_summary',
            dag=dag)
    ## PIPELINE
    run_picard_alignment_summary >> run_picard_qual_summary
    ## TASK
    run_picard_rna_summary = \
        DummyOperator(
            task_id='run_picard_rna_summary',
            dag=dag)
    ## PIPELINE
    run_picard_qual_summary >> run_picard_rna_summary
    ## TASK
    run_picard_gc_summary = \
        DummyOperator(
            task_id='run_picard_gc_summary',
            dag=dag)
    ## PIPELINE
    run_picard_rna_summary >> run_picard_gc_summary
    ## TASK
    run_samtools_stats = \
        DummyOperator(
            task_id='run_samtools_stats',
            dag=dag)
    ## PIPELINE
    run_picard_gc_summary >> run_samtools_stats
    ## TASK
    run_samtools_idxstats = \
        DummyOperator(
            task_id='run_samtools_idxstats',
            dag=dag)
    ## PIPELINE
    run_samtools_stats >> run_samtools_idxstats
    ## TASK
    run_multiqc = \
        DummyOperator(
            task_id='run_multiqc',
            dag=dag)
    ## PIPELINE
    run_samtools_idxstats >> run_multiqc
    ## TASK
    upload_multiqc_to_ftp = \
        DummyOperator(
            task_id='upload_multiqc_to_ftp',
            dag=dag)
    ## PIPELINE
    run_multiqc >> upload_multiqc_to_ftp
    ## TASK
    upload_multiqc_to_box = \
        DummyOperator(
            task_id='upload_multiqc_to_box',
            dag=dag)
    ## PIPELINE
    run_multiqc >> upload_multiqc_to_box
    ## TASK
    # NOTE(review): this join sits downstream of decide_analysis_branch and
    # uses the default trigger rule, so it may be skipped when any analysis
    # branch is skipped - confirm the intended trigger rule.
    update_analysis_and_status = \
        DummyOperator(
            task_id='update_analysis_and_status',
            dag=dag)
    ## PIPELINE
    # upload_multiqc_to_box is not joined here, matching the original wiring.
    [upload_multiqc_to_ftp,
     upload_scanpy_report_for_sc_5p_to_ftp,
     upload_scanpy_report_for_sc_5p_to_box,
     upload_cellbrowser_for_sc_5p_to_ftp,
     upload_scanpy_report_for_vdj_to_ftp,
     upload_scanpy_report_for_vdj_to_box,
     upload_scanpy_report_for_vdj_b_to_ftp,
     upload_scanpy_report_for_vdj_b_to_box,
     upload_scanpy_report_for_vdj_t_to_ftp,
     upload_scanpy_report_for_vdj_t_to_box,
     upload_seurat_report_for_sc_5p_ftp,
     upload_seurat_report_for_sc_5p_to_box,
     upload_results_to_irods,
     upload_report_to_ftp,
     upload_report_to_box,
     upload_cram_to_irods] >> update_analysis_and_status