{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1546ea83-a266-4f13-8eb8-d5cc6f72ee56",
   "metadata": {},
   "source": [
    "# Batch Downloads"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6ebc1f0-ff62-4a44-be62-7860e5a55155",
   "metadata": {},
   "source": [
    "This notebook shows you how to download multiple datacubes. It is not needed if you already have direct access to the datacube files (see the `pdr_dir` setup below)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "49765cb5-4f3e-41a2-b7b1-157a8b1bfae0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os.path as op\n",
    "from joblib import Parallel, delayed\n",
    "import numpy as np\n",
    "from astropy.table import Column, Table, hstack, unique\n",
    "\n",
    "from astropy.coordinates import SkyCoord\n",
    "import astropy.units as u\n",
    "from astropy.io import fits\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.colors import LogNorm\n",
    "\n",
    "from tqdm import tqdm\n",
    "import traceback\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "12f8ce30-0650-44ff-800c-2ebd942effde",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Follow notebook 1 to access the IFU index file; pdr_dir must point to the directory it is located in.\n",
    "# The first path is used when it exists, otherwise the second one is assumed.\n",
    "primary_pdr_dir = '/home/jovyan/Hobby-Eberly-Public/HETDEX/pdr/pdr1/'\n",
    "fallback_pdr_dir = '/home/jovyan/work/pdr1/'\n",
    "pdr_dir = primary_pdr_dir if op.exists(primary_pdr_dir) else fallback_pdr_dir\n",
    "\n",
    "# Read the IFU index table from that directory\n",
    "ifu_index_file = op.join(pdr_dir, 'ifu-index.fits')\n",
    "ifu_data = Table.read(ifu_index_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "48016742-7556-4d72-ab6d-89fcf010d338",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a single SkyCoord array from the IFU centre columns of the index table\n",
    "ifu_ra = ifu_data['ra_cen']*u.deg\n",
    "ifu_dec = ifu_data['dec_cen']*u.deg\n",
    "ifu_coords = SkyCoord(ra=ifu_ra, dec=ifu_dec)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f073ba4-1c82-4bcc-9266-34b1c1d648b7",
   "metadata": {},
   "source": [
    "## Example Download Script for a single coordinate with multiple observations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "07663777-02d0-4581-b35b-4c73a84d9d7f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(20170326007, '096'), (20181118020, '036'), (20181119015, '036'), (20181119016, '036'), (20181120012, '036'), (20181120013, '036'), (20181215031, '036'), (20190104018, '036'), (20190105022, '036'), (20190108013, '036'), (20190112023, '036'), (20210307020, '040'), (20220130020, '024'), (20220131010, '040'), (20230123017, '023'), (20240310015, '070'), (20240313016, '070')]\n"
     ]
    }
   ],
   "source": [
    "coord = SkyCoord(ra=150.23189*u.deg, dec=2.363963*u.deg)\n",
    "\n",
    "# Select the datacubes with possible coverage: IFU centres within 37 arcsec of the target\n",
    "separations = coord.separation(ifu_coords)\n",
    "sel = separations < 37*u.arcsec\n",
    "\n",
    "# Each selected row is one observation covering this AGN (17 in the saved output of this cell)\n",
    "ifulist = ifu_data[sel]\n",
    "\n",
    "# Collect (shotid, ifuslot) pairs; ifuslot is kept as a string\n",
    "ifu_pairs = []\n",
    "for row in ifulist:\n",
    "    ifu_pairs.append((int(row['shotid']), str(row['ifuslot'])))\n",
    "print(ifu_pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "833dec60-f135-4729-9dec-958ce85e4fca",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a list of URLs for the datacubes of interest and save it to the file urls.txt. You can then either download the files in this notebook or use wget in a unix/terminal window."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d5b4595e-5bc2-480d-9493-f8eec71abc2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Base URL; each datacube lives at <base_url>/<shotid>/<filename>\n",
    "base_url = \"http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes\"\n",
    "\n",
    "# Build one URL per (shotid, ifuslot) pair\n",
    "urls = []\n",
    "for shotid, ifuslot in ifu_pairs:\n",
    "    filename = f\"dex_cube_{shotid}_{ifuslot}.fits\"\n",
    "    urls.append(f\"{base_url}/{shotid}/{filename}\")\n",
    "\n",
    "# Write the URLs to file, one per line\n",
    "with open(\"urls.txt\", \"w\") as f:\n",
    "    f.write(\"\".join(url + \"\\n\" for url in urls))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f52a4d7d-62bd-4e69-8ee3-a5c751c72d8f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20170326007/dex_cube_20170326007_096.fits',\n",
       " 'http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20181118020/dex_cube_20181118020_036.fits',\n",
       " 'http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20181119015/dex_cube_20181119015_036.fits',\n",
       " 'http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20181119016/dex_cube_20181119016_036.fits',\n",
       " 'http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20181120012/dex_cube_20181120012_036.fits',\n",
       " 'http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20181120013/dex_cube_20181120013_036.fits',\n",
       " 'http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20181215031/dex_cube_20181215031_036.fits',\n",
       " 'http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20190104018/dex_cube_20190104018_036.fits',\n",
       " 'http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20190105022/dex_cube_20190105022_036.fits',\n",
       " 'http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20190108013/dex_cube_20190108013_036.fits',\n",
       " 'http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20190112023/dex_cube_20190112023_036.fits',\n",
       " 'http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20210307020/dex_cube_20210307020_040.fits',\n",
       " 'http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20220130020/dex_cube_20220130020_024.fits',\n",
       " 'http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20220131010/dex_cube_20220131010_040.fits',\n",
       " 'http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20230123017/dex_cube_20230123017_023.fits',\n",
       " 'http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20240310015/dex_cube_20240310015_070.fits',\n",
       " 'http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes/20240313016/dex_cube_20240313016_070.fits']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urls"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eb61cc06-4a35-4b58-89f6-553833bd788a",
   "metadata": {},
   "source": [
    "Now that you have the URL list, you can either download the files from within a notebook or, if you prefer working in an ssh/terminal window, use wget. The example wget command below downloads the files while preserving the pdr1 data file structure, so all notebooks will work provided you point them to the top-level directory where this command was run (ideally the same directory as ifu-index.fits).\n",
    "\n",
    "    wget --cut-dirs=4 -nH -x -i urls.txt\n",
    "\n",
    "If you would like to download in parallel, you can use this unix command:\n",
    "\n",
    "    cat urls.txt | xargs -n 1 -P 4 wget  --cut-dirs=4 -nH -x"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7396e2c5-7748-45a2-887f-96afb035409f",
   "metadata": {},
   "source": [
    "Or, if you prefer, you can use this code to download the files from within this Jupyter notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "80d3749e-8227-4406-afb1-a0b497317af8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading: datacubes/20170326007/dex_cube_20170326007_096.fits\n",
      "Downloading: datacubes/20181118020/dex_cube_20181118020_036.fits\n",
      "Downloading: datacubes/20181119015/dex_cube_20181119015_036.fits\n",
      "Downloading: datacubes/20181119016/dex_cube_20181119016_036.fits\n",
      "Downloading: datacubes/20181120012/dex_cube_20181120012_036.fits\n",
      "Downloading: datacubes/20181120013/dex_cube_20181120013_036.fits\n",
      "Downloading: datacubes/20181215031/dex_cube_20181215031_036.fits\n",
      "Downloading: datacubes/20190104018/dex_cube_20190104018_036.fits\n",
      "Downloading: datacubes/20190105022/dex_cube_20190105022_036.fits\n",
      "Downloading: datacubes/20190108013/dex_cube_20190108013_036.fits\n",
      "Downloading: datacubes/20190112023/dex_cube_20190112023_036.fits\n",
      "Downloading: datacubes/20210307020/dex_cube_20210307020_040.fits\n",
      "Downloading: datacubes/20220130020/dex_cube_20220130020_024.fits\n",
      "Downloading: datacubes/20220131010/dex_cube_20220131010_040.fits\n",
      "Downloading: datacubes/20230123017/dex_cube_20230123017_023.fits\n",
      "Downloading: datacubes/20240310015/dex_cube_20240310015_070.fits\n",
      "Downloading: datacubes/20240313016/dex_cube_20240313016_070.fits\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import subprocess\n",
    "from urllib.parse import urlsplit\n",
    "\n",
    "# Download every URL in `urls` into pdr_dir, mirroring the directory layout found\n",
    "# below base_url_path on the server. Files that already exist locally are skipped.\n",
    "pdr_dir = '/home/jovyan/work/pdr1'\n",
    "base_url_path = '/hetdex/HETDEX/pdr/pdr1/'  # everything before this will be cut\n",
    "\n",
    "# URLs that could not be downloaded; inspect after the loop and re-run this cell to retry them\n",
    "failed_urls = []\n",
    "\n",
    "for url in urls:\n",
    "    path = urlsplit(url).path\n",
    "    if base_url_path not in path:\n",
    "        # Without the marker the split below would return the absolute server path,\n",
    "        # and os.path.join would then silently drop pdr_dir.\n",
    "        print(f\"Skipping URL outside {base_url_path}: {url}\")\n",
    "        failed_urls.append(url)\n",
    "        continue\n",
    "    rel_path = path.split(base_url_path, 1)[-1]  # Extract relative path under pdr1/\n",
    "    local_path = os.path.join(pdr_dir, rel_path)\n",
    "\n",
    "    # Ensure directory exists\n",
    "    os.makedirs(os.path.dirname(local_path), exist_ok=True)\n",
    "\n",
    "    # Download if not present\n",
    "    if os.path.exists(local_path):\n",
    "        print(f\"Skipping existing file: {rel_path}\")\n",
    "        continue\n",
    "\n",
    "    print(f\"Downloading: {rel_path}\")\n",
    "    # wget -O creates the output file even when the transfer fails, so download to a\n",
    "    # temporary name first; otherwise a truncated file would be skipped on every later run.\n",
    "    partial_path = local_path + '.part'\n",
    "    result = subprocess.run([\n",
    "        'wget', '-q',\n",
    "        '-O', partial_path,\n",
    "        url\n",
    "    ])\n",
    "\n",
    "    if result.returncode == 0:\n",
    "        # Move the finished download into place only once it is complete\n",
    "        os.replace(partial_path, local_path)\n",
    "    else:\n",
    "        if os.path.exists(partial_path):\n",
    "            os.remove(partial_path)\n",
    "        print(f\"Download failed (wget exit code {result.returncode}): {url}\")\n",
    "        failed_urls.append(url)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d46134ca-821b-4b5a-971a-101ccc311fd2",
   "metadata": {},
   "source": [
    "## Example: Downloading for a long coordinate list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "85ab3f3b-100e-49f8-b953-934acf211292",
   "metadata": {},
   "outputs": [],
   "source": [
    "catalog = Table.read('dr16q_hdr5.fits')\n",
    "\n",
    "# Drop the catalog's own 'shotid' column (this example might be done with a different catalog later).\n",
    "# NOTE(review): presumably this avoids a clash with the 'shotid' column of ifu_data when the tables are stacked below - confirm.\n",
    "catalog.remove_column('shotid')\n",
    "\n",
    "catalog_coords = SkyCoord(\n",
    "    ra=catalog['ra'],\n",
    "    dec=catalog['dec'],\n",
    "    unit='deg',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "402f323a-390f-406b-8e27-a990c7b71cc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find every (IFU centre, catalog object) combination separated by less than 37 arcsec\n",
    "match_radius = 37*u.arcsec\n",
    "sky_matches = catalog_coords.search_around_sky(ifu_coords, match_radius)\n",
    "idx_ifu, idx_catalog, d2d, d3d = sky_matches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "02f3d99f-024d-4ac8-90e2-c1dd58b82299",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create master table that matches IFU observation coverage with catalog objects"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "e8d0492e-4578-4837-ae0d-6035076099dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row-aligned pieces of the master table, one row per match found above\n",
    "matched_coords = catalog_coords[idx_catalog]\n",
    "matched_catalog = catalog[idx_catalog]\n",
    "matched_ifus = ifu_data[idx_ifu]\n",
    "\n",
    "table = hstack([matched_coords, matched_catalog, matched_ifus])\n",
    "# The coordinate column comes out of hstack as 'col0'; give it a descriptive name\n",
    "table.rename_column('col0', 'coords')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "9cad7c27-59a0-45fa-96bc-a01cc6d73f8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# reduce cube list to unique IFU observations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "3d6b2254-b080-493d-898d-887369cc6c62",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep one row per distinct (shotid, ifuslot) combination\n",
    "shot_ifu_columns = table['shotid', 'ifuslot']\n",
    "ifulist = unique(shot_ifu_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "160bd9fd-21ba-4011-a8a9-aaaf1bb607f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make sure 'ifuslot' is a string to preserve naming like '034'\n",
    "ifu_pairs = []\n",
    "for row in ifulist:\n",
    "    shot_number = int(row['shotid'])\n",
    "    slot_name = str(row['ifuslot'])\n",
    "    ifu_pairs.append((shot_number, slot_name))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "fc25cf8a-c817-4f17-911a-fbbca834965a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Base URL; each datacube lives at <base_url>/<shotid>/<filename>\n",
    "base_url = \"http://web.corral.tacc.utexas.edu/hetdex/HETDEX/pdr/pdr1/datacubes\"\n",
    "\n",
    "# Build one URL per (shotid, ifuslot) pair\n",
    "urls = []\n",
    "for shotid, ifuslot in ifu_pairs:\n",
    "    filename = f\"dex_cube_{shotid}_{ifuslot}.fits\"\n",
    "    urls.append(f\"{base_url}/{shotid}/{filename}\")\n",
    "\n",
    "# Write the URLs to file, one per line\n",
    "with open(\"urls.txt\", \"w\") as f:\n",
    "    f.write(\"\".join(url + \"\\n\" for url in urls))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "a0bb28b9-0f4f-4066-91d0-f93eb974146a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of datacubes to download is 8124. This is approximately 219.35 Gb.\n"
     ]
    }
   ],
   "source": [
    "# Rough size estimate: number of datacubes times 0.027 per cube\n",
    "n_cubes = len(urls)\n",
    "approx_size = n_cubes*0.027\n",
    "summary = 'Number of datacubes to download is {}. This is approximately {:5.2f} Gb.'\n",
    "print(summary.format(n_cubes, approx_size))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4666a7e5-d57d-4b54-8d38-e58932ea54f3",
   "metadata": {},
   "source": [
    "We recommend using wget for large downloads. \n",
    "\n",
    "    wget --cut-dirs=4 -nH -x -i urls.txt\n",
    "\n",
    "You can also download in parallel. Increase the value after `-P` (4 here) to run more simultaneous downloads, but too many may cause server issues.\n",
    "\n",
    "    cat urls.txt | xargs -n 1 -P 4 wget --cut-dirs=4 -nH -x"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}