# dag14_crick_seqrun_transfer.py
  1. from datetime import timedelta
  2. from airflow.models import DAG,Variable
  3. from airflow.utils.dates import days_ago
  4. from airflow.contrib.operators.ssh_operator import SSHOperator
  5. from airflow.contrib.hooks.ssh_hook import SSHHook
  6. from airflow.operators.python_operator import PythonOperator
  7. from airflow.operators.python_operator import BranchPythonOperator
  8. from igf_airflow.utils.dag14_crick_seqrun_transfer_utils import check_and_transfer_run_func
  9. from igf_airflow.utils.dag14_crick_seqrun_transfer_utils import extract_tar_file_func
  10. from igf_airflow.utils.dag14_crick_seqrun_transfer_utils import find_and_split_md5_func
  11. from igf_airflow.utils.dag14_crick_seqrun_transfer_utils import validate_md5_chunk_func
  12. args = {
  13. 'owner': 'airflow',
  14. 'start_date': days_ago(2),
  15. 'retries': 4,
  16. 'retry_delay': timedelta(minutes=5),
  17. 'provide_context': True,
  18. 'email_on_failure': False,
  19. 'email_on_retry': False,
  20. 'catchup': False,
  21. 'max_active_runs': 5,
  22. }
  23. ## SSH HOOK
  24. orwell_ssh_hook = \
  25. SSHHook(
  26. key_file=Variable.get('hpc_ssh_key_file'),
  27. username=Variable.get('hpc_user'),
  28. remote_host=Variable.get('orwell_server_hostname'))
  29. dag = \
  30. DAG(
  31. dag_id='dag14_crick_seqrun_transfer',
  32. schedule_interval=None,
  33. default_args=args,
  34. tags=['ftp', 'hpc', 'orwell', 'wells'])
  35. with dag:
  36. ## TASK
  37. # not working on HPC
  38. check_and_transfer_run = \
  39. PythonOperator(
  40. task_id='check_and_transfer_run',
  41. dag=dag,
  42. pool='crick_ftp_pool',
  43. queue='hpc_4G',
  44. python_callable=check_and_transfer_run_func)
  45. ## TASK
  46. extract_tar_file = \
  47. PythonOperator(
  48. task_id='extract_tar_file',
  49. dag=dag,
  50. queue='hpc_4G',
  51. python_callable=extract_tar_file_func)
  52. ## TASK
  53. find_and_split_md5 = \
  54. BranchPythonOperator(
  55. task_id='find_and_split_md5',
  56. dag=dag,
  57. queue='hpc_4G',
  58. python_callable=find_and_split_md5_func)
  59. ## PIPELINE
  60. check_and_transfer_run >> extract_tar_file >> find_and_split_md5
  61. for chunk_id in range(0, 21):
  62. t = \
  63. PythonOperator(
  64. task_id='md5_validate_chunk_{0}'.format(i),
  65. dag=dag,
  66. queue='hpc_4G',
  67. params={'chunk_id': chunk_id,
  68. 'xcom_task': 'find_and_split_md5',
  69. 'xcom_key': 'md5_file_chunk'},
  70. python_callable=validate_md5_chunk_func)
  71. ## PIPELINE
  72. find_and_split_md5 >> t