dag9_tenx_single_cell_immune_profiling.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385
  1. from datetime import timedelta
  2. import os,json,logging,subprocess
  3. from airflow.models import DAG,Variable
  4. from airflow.utils.dates import days_ago
  5. from airflow.operators.python_operator import PythonOperator
  6. from airflow.operators.python_operator import BranchPythonOperator
  7. from airflow.operators.dummy_operator import DummyOperator
  8. from igf_airflow.logging.upload_log_msg import send_log_to_channels,log_success,log_failure,log_sleep
  9. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import fetch_analysis_info_and_branch_func
  10. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import configure_cellranger_run_func
  11. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_sc_read_trimmming_func
  12. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_cellranger_tool
  13. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import decide_analysis_branch_func
  14. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_cellranger_result_to_db_func
  15. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import ftp_files_upload_for_analysis
  16. ## ARGS
  17. default_args = {
  18. 'owner': 'airflow',
  19. 'depends_on_past': False,
  20. 'start_date': days_ago(2),
  21. 'email_on_failure': False,
  22. 'email_on_retry': False,
  23. 'retries': 4,
  24. 'max_active_runs':10,
  25. 'catchup':False,
  26. 'retry_delay': timedelta(minutes=5),
  27. 'provide_context': True,
  28. }
  29. FEATURE_TYPE_LIST = Variable.get('tenx_single_cell_immune_profiling_feature_types')
  30. ## DAG
  31. dag = \
  32. DAG(
  33. dag_id='dag9_tenx_single_cell_immune_profiling',
  34. schedule_interval=None,
  35. tags=['hpc','analysis','tenx','sc'],
  36. default_args=default_args,
  37. orientation='LR')
  38. with dag:
  39. ## TASK
  40. fetch_analysis_info_and_branch = \
  41. BranchPythonOperator(
  42. task_id='fetch_analysis_info',
  43. dag=dag,
  44. queue='hpc_4g',
  45. params={'no_analysis_task':'no_analysis',
  46. 'analysis_description_xcom_key':'analysis_description',
  47. 'analysis_info_xcom_key':'analysis_info'},
  48. python_callable=fetch_analysis_info_and_branch_func)
  ## TASK
  # Runs configure_cellranger_run_func on the hpc_4g queue.
  # 'xcom_pull_task_id' has to be the task_id of the task that publishes the
  # analysis description/info XCom values. That task_id is
  # 'fetch_analysis_info'; 'fetch_analysis_info_and_branch' is only the name
  # of the Python variable and no task with that id exists in this DAG.
  # 'library_csv_xcom_key' is the key other tasks use when pulling the
  # library csv from this task.
  # NOTE(review): this task sits downstream of every
  # collect_trimmed_files_<feature type> task, which in turn sit below a
  # branch operator. With the default trigger rule a skipped upstream may
  # cause this task to be skipped as well - confirm whether an explicit
  # trigger_rule is needed for the Airflow version in use.
  configure_cellranger_run = \
    PythonOperator(
      task_id='configure_cellranger_run',
      dag=dag,
      queue='hpc_4g',
      params={'xcom_pull_task_id':'fetch_analysis_info',
              'analysis_description_xcom_key':'analysis_description',
              'analysis_info_xcom_key':'analysis_info',
              'library_csv_xcom_key':'cellranger_library_csv'},
      python_callable=configure_cellranger_run_func)
  # Build one group of tasks per feature type:
  #   <feature type> -> run_trim_<feature type>_0..31
  #                  -> collect_trimmed_files_<feature type>
  #                  -> configure_cellranger_run
  for analysis_name in FEATURE_TYPE_LIST:
    ## TASK
    # Entry task of the group; its task_id is the feature type name itself
    # (presumably the id returned by the upstream branch callable - confirm)
    task_branch = \
      DummyOperator(
        task_id=analysis_name,
        dag=dag)
    ## TASK
    # A fixed set of 32 trimming tasks (run_id 0..31) per feature type.
    # 'xcom_pull_task_id' must be the task_id of the task publishing the
    # analysis info XCom, i.e. 'fetch_analysis_info'; no task with the id
    # 'fetch_analysis_info_and_branch' exists in this DAG.
    run_trim_list = [
      PythonOperator(
        task_id='run_trim_{0}_{1}'.format(analysis_name,run_id),
        dag=dag,
        params={'xcom_pull_task_id':'fetch_analysis_info',
                'analysis_info_xcom_key':'analysis_info',
                'analysis_name':analysis_name,
                'run_id':run_id,
                'r1_length':26,
                'r2_length':0,
                'fastq_input_dir_tag':'fastq_dir',
                'fastq_output_dir_tag':'output_path'},
        python_callable=run_sc_read_trimmming_func)
      for run_id in range(32)]
    ## TASK
    # Join point for all trimming tasks of this feature type
    collect_trimmed_files = \
      DummyOperator(
        task_id='collect_trimmed_files_{0}'.format(analysis_name),
        dag=dag)
    ## PIPELINE
    fetch_analysis_info_and_branch >> task_branch
    task_branch >> run_trim_list
    run_trim_list >> collect_trimmed_files
    collect_trimmed_files >> configure_cellranger_run
  ## TASK
  # Placeholder task hanging directly off the first branch operator; its
  # task_id is the value passed to that operator as 'no_analysis_task'.
  no_analysis = DummyOperator(task_id='no_analysis', dag=dag)
  ## PIPELINE
  fetch_analysis_info_and_branch.set_downstream(no_analysis)
  ## TASK
  # Runs run_cellranger_tool after the cellranger run has been configured.
  # Two XCom sources are named in params:
  #   * analysis description: task_id 'fetch_analysis_info' (the id of the
  #     first branch task; 'fetch_analysis_info_and_branch' is only its
  #     Python variable name and is not a task_id in this DAG)
  #   * library csv: task_id 'configure_cellranger_run'
  # 'cellranger_xcom_key' is the key downstream tasks use to pull this
  # task's output; 'cellranger_options' are extra command line options.
  run_cellranger = \
    PythonOperator(
      task_id='run_cellranger',
      dag=dag,
      params={'analysis_description_xcom_pull_task':'fetch_analysis_info',
              'analysis_description_xcom_key':'analysis_description',
              'library_csv_xcom_key':'cellranger_library_csv',
              'library_csv_xcom_pull_task':'configure_cellranger_run',
              'cellranger_xcom_key':'cellranger_output',
              'cellranger_options':['--localcores 8','--localmem 64']},
      python_callable=run_cellranger_tool)
  ## PIPELINE
  configure_cellranger_run >> run_cellranger
  ## TASK
  # Branch operator running decide_analysis_branch_func once cellranger has
  # finished. Each '*_task' param carries the task_id of one candidate
  # downstream branch, so every value must match a task_id defined in this
  # DAG. The T cell scirpy task is registered as 'run_scirpy_for_vdj_t';
  # the param key itself is left unchanged because the callable reads it.
  # The library csv is pulled from the configure_cellranger_run task.
  decide_analysis_branch = \
    BranchPythonOperator(
      task_id='decide_analysis_branch',
      dag=dag,
      python_callable=decide_analysis_branch_func,
      params={'load_cellranger_result_to_db_task':'load_cellranger_result_to_db',
              'run_scanpy_for_sc_5p_task':'run_scanpy_for_sc_5p',
              'run_scirpy_for_vdj_task':'run_scirpy_for_vdj',
              'run_scirpy_for_vdj_b_task':'run_scirpy_for_vdj_b',
              'run_scirpy_vdj_t_task':'run_scirpy_for_vdj_t',
              'run_seurat_for_sc_5p_task':'run_seurat_for_sc_5p',
              'run_picard_alignment_summary_task':'run_picard_alignment_summary',
              'convert_bam_to_cram_task':'convert_bam_to_cram',
              'library_csv_xcom_key':'cellranger_library_csv',
              'library_csv_xcom_pull_task':'configure_cellranger_run'})
  ## PIPELINE
  run_cellranger >> decide_analysis_branch
  ## TASK
  # Runs load_cellranger_result_to_db_func on the cellranger output.
  # XCom sources named in params:
  #   * analysis description: task_id 'fetch_analysis_info' (the id of the
  #     first branch task; 'fetch_analysis_info_and_branch' is only its
  #     Python variable name and is not a task_id in this DAG)
  #   * cellranger output: task_id 'run_cellranger'
  # 'html_xcom_key' is the key the FTP upload task below pulls from.
  load_cellranger_result_to_db = \
    PythonOperator(
      task_id='load_cellranger_result_to_db',
      dag=dag,
      python_callable=load_cellranger_result_to_db_func,
      params={'analysis_description_xcom_pull_task':'fetch_analysis_info',
              'analysis_description_xcom_key':'analysis_description',
              'cellranger_xcom_key':'cellranger_output',
              'cellranger_xcom_pull_task':'run_cellranger',
              'collection_type':'CELLRANGER_MULTI',
              'collection_table':'sample',
              'xcom_collection_name_key':'sample_igf_id',
              'genome_column':'genome_build',
              'analysis_name':'cellranger_multi',
              'output_xcom_key':'loaded_output_files',
              'html_xcom_key':'html_report_file',
              'html_report_file_name':'web_summary.html'})
  ## TASK
  # Runs ftp_files_upload_for_analysis; it pulls the 'html_report_file' key
  # from the load_cellranger_result_to_db task
  upload_report_to_ftp = \
    PythonOperator(
      task_id='upload_report_to_ftp',
      dag=dag,
      python_callable=ftp_files_upload_for_analysis,
      params={'xcom_pull_task':'load_cellranger_result_to_db',
              'xcom_pull_files_key':'html_report_file',
              'collection_name_key':'sample_igf_id',
              'collection_type':'FTP_CELLRANGER_MULTI',
              'collection_table':'sample',
              'collect_remote_file':True})
  ## TASK
  # Placeholders (DummyOperator) for the box and irods uploads
  upload_report_to_box = \
    DummyOperator(
      task_id='upload_report_to_box',
      dag=dag,
      params=None)
  upload_results_to_irods = \
    DummyOperator(
      task_id='upload_results_to_irods',
      dag=dag,
      params=None)
  ## PIPELINE
  decide_analysis_branch >> load_cellranger_result_to_db
  load_cellranger_result_to_db >> upload_report_to_ftp
  load_cellranger_result_to_db >> upload_report_to_box
  load_cellranger_result_to_db >> upload_results_to_irods
  ## TASK
  # run_scanpy_for_sc_5p branch: every task here is still a DummyOperator
  # placeholder. The params already name the XCom key and the source task
  # (run_cellranger) for the cellranger output.
  run_scanpy_for_sc_5p = DummyOperator(
    task_id='run_scanpy_for_sc_5p',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
    })
  upload_scanpy_report_for_sc_5p_to_ftp = DummyOperator(
    task_id='upload_scanpy_report_for_sc_5p_to_ftp',
    dag=dag)
  upload_scanpy_report_for_sc_5p_to_box = DummyOperator(
    task_id='upload_scanpy_report_for_sc_5p_to_box',
    dag=dag)
  run_cellbrowser_for_sc_5p = DummyOperator(
    task_id='run_cellbrowser_for_sc_5p',
    dag=dag)
  upload_cellbrowser_for_sc_5p_to_ftp = DummyOperator(
    task_id='upload_cellbrowser_for_sc_5p_to_ftp',
    dag=dag)
  ## PIPELINE
  # scanpy fans out to two uploads and to cellbrowser, which has its own upload
  decide_analysis_branch >> run_scanpy_for_sc_5p
  run_scanpy_for_sc_5p >> [
    upload_scanpy_report_for_sc_5p_to_ftp,
    upload_scanpy_report_for_sc_5p_to_box,
    run_cellbrowser_for_sc_5p,
  ]
  run_cellbrowser_for_sc_5p >> upload_cellbrowser_for_sc_5p_to_ftp
  ## TASK
  # run_scirpy_for_vdj branch (DummyOperator placeholders). The params name
  # the XCom key and source task (run_cellranger) for the cellranger output.
  run_scirpy_for_vdj = DummyOperator(
    task_id='run_scirpy_for_vdj',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
    })
  upload_scanpy_report_for_vdj_to_ftp = DummyOperator(
    task_id='upload_scanpy_report_for_vdj_to_ftp',
    dag=dag)
  upload_scanpy_report_for_vdj_to_box = DummyOperator(
    task_id='upload_scanpy_report_for_vdj_to_box',
    dag=dag)
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj
  run_scirpy_for_vdj >> [
    upload_scanpy_report_for_vdj_to_ftp,
    upload_scanpy_report_for_vdj_to_box,
  ]
  ## TASK
  # run_scirpy_for_vdj_b branch (DummyOperator placeholders). The params name
  # the XCom key and source task (run_cellranger) for the cellranger output.
  run_scirpy_for_vdj_b = DummyOperator(
    task_id='run_scirpy_for_vdj_b',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
    })
  upload_scanpy_report_for_vdj_b_to_ftp = DummyOperator(
    task_id='upload_scanpy_report_for_vdj_b_to_ftp',
    dag=dag)
  upload_scanpy_report_for_vdj_b_to_box = DummyOperator(
    task_id='upload_scanpy_report_for_vdj_b_to_box',
    dag=dag)
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj_b
  run_scirpy_for_vdj_b >> [
    upload_scanpy_report_for_vdj_b_to_ftp,
    upload_scanpy_report_for_vdj_b_to_box,
  ]
  ## TASK
  # run_scirpy_for_vdj_t branch (DummyOperator placeholders). The params name
  # the XCom key and source task (run_cellranger) for the cellranger output.
  run_scirpy_for_vdj_t = DummyOperator(
    task_id='run_scirpy_for_vdj_t',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
    })
  upload_scanpy_report_for_vdj_t_to_ftp = DummyOperator(
    task_id='upload_scanpy_report_for_vdj_t_to_ftp',
    dag=dag)
  upload_scanpy_report_for_vdj_t_to_box = DummyOperator(
    task_id='upload_scanpy_report_for_vdj_t_to_box',
    dag=dag)
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj_t
  run_scirpy_for_vdj_t >> [
    upload_scanpy_report_for_vdj_t_to_ftp,
    upload_scanpy_report_for_vdj_t_to_box,
  ]
  ## TASK
  # run_seurat_for_sc_5p branch (DummyOperator placeholders). The params name
  # the XCom key and source task (run_cellranger) for the cellranger output.
  run_seurat_for_sc_5p = DummyOperator(
    task_id='run_seurat_for_sc_5p',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
    })
  upload_seurat_report_for_sc_5p_ftp = DummyOperator(
    task_id='upload_seurat_report_for_sc_5p_ftp',
    dag=dag)
  upload_seurat_report_for_sc_5p_to_box = DummyOperator(
    task_id='upload_seurat_report_for_sc_5p_to_box',
    dag=dag)
  ## PIPELINE
  decide_analysis_branch >> run_seurat_for_sc_5p
  run_seurat_for_sc_5p >> [
    upload_seurat_report_for_sc_5p_ftp,
    upload_seurat_report_for_sc_5p_to_box,
  ]
  ## TASK
  # convert_bam_to_cram branch (DummyOperator placeholders). The params name
  # the XCom key and source task (run_cellranger) for the cellranger output.
  convert_bam_to_cram = DummyOperator(
    task_id='convert_bam_to_cram',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
    })
  upload_cram_to_irods = DummyOperator(
    task_id='upload_cram_to_irods',
    dag=dag)
  ## PIPELINE
  decide_analysis_branch >> convert_bam_to_cram >> upload_cram_to_irods
  ## TASK
  # QC branch: a strictly linear chain of DummyOperator placeholders
  # (picard -> samtools -> multiqc) followed by two multiqc report uploads.
  # Only the first task carries params; they name the XCom key and source
  # task (run_cellranger) for the cellranger output.
  run_picard_alignment_summary = DummyOperator(
    task_id='run_picard_alignment_summary',
    dag=dag,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
    })
  run_picard_qual_summary = DummyOperator(
    task_id='run_picard_qual_summary',
    dag=dag)
  run_picard_rna_summary = DummyOperator(
    task_id='run_picard_rna_summary',
    dag=dag)
  run_picard_gc_summary = DummyOperator(
    task_id='run_picard_gc_summary',
    dag=dag)
  run_samtools_stats = DummyOperator(
    task_id='run_samtools_stats',
    dag=dag)
  run_samtools_idxstats = DummyOperator(
    task_id='run_samtools_idxstats',
    dag=dag)
  run_multiqc = DummyOperator(
    task_id='run_multiqc',
    dag=dag)
  upload_multiqc_to_ftp = DummyOperator(
    task_id='upload_multiqc_to_ftp',
    dag=dag)
  upload_multiqc_to_box = DummyOperator(
    task_id='upload_multiqc_to_box',
    dag=dag)
  ## PIPELINE
  (decide_analysis_branch
   >> run_picard_alignment_summary
   >> run_picard_qual_summary
   >> run_picard_rna_summary
   >> run_picard_gc_summary
   >> run_samtools_stats
   >> run_samtools_idxstats
   >> run_multiqc)
  run_multiqc >> [upload_multiqc_to_ftp, upload_multiqc_to_box]
  ## TASK
  # Final join task (currently a DummyOperator placeholder) that every
  # terminal upload task feeds into.
  # NOTE(review): all upstream tasks sit below decide_analysis_branch, a
  # branch operator. With the default trigger rule a skipped upstream branch
  # may cause this task to be skipped as well - confirm whether an explicit
  # trigger_rule is needed for the Airflow version in use.
  update_analysis_and_status = \
    DummyOperator(
      task_id='update_analysis_and_status',
      dag=dag)
  ## PIPELINE
  # upload_multiqc_to_box is included so that no terminal upload is left
  # outside the final join; it was the only leaf task not wired in.
  final_upload_tasks = [
    upload_multiqc_to_ftp,
    upload_multiqc_to_box,
    upload_scanpy_report_for_sc_5p_to_ftp,
    upload_scanpy_report_for_sc_5p_to_box,
    upload_cellbrowser_for_sc_5p_to_ftp,
    upload_scanpy_report_for_vdj_to_ftp,
    upload_scanpy_report_for_vdj_to_box,
    upload_scanpy_report_for_vdj_b_to_ftp,
    upload_scanpy_report_for_vdj_b_to_box,
    upload_scanpy_report_for_vdj_t_to_ftp,
    upload_scanpy_report_for_vdj_t_to_box,
    upload_seurat_report_for_sc_5p_ftp,
    upload_seurat_report_for_sc_5p_to_box,
    upload_results_to_irods,
    upload_report_to_ftp,
    upload_report_to_box,
    upload_cram_to_irods,
  ]
  final_upload_tasks >> update_analysis_and_status