#![allow(clippy::undocumented_unsafe_blocks)]

use std::{
    env, mem,
    os::{
        fd::{AsRawFd, FromRawFd, OwnedFd, RawFd},
        unix::ffi::OsStrExt,
    },
    ptr,
};

use btoi::btoi;
use libloading::os::unix::Symbol;
use nix::{
    errno::Errno,
    libc,
    sys::signal::{kill, sigprocmask, SigSet, SigmaskHow, Signal},
    unistd::{close, getpid, read, setsid, tcsetpgrp, write},
};

use crate::{
    caps,
    config::*,
    confine::{confine_scmp_ioctl, confine_scmp_pwritev2, safe_drop_cap},
    fs::retry_on_eintr,
    landlock::{CompatLevel, Compatible, RestrictSelfFlags, Ruleset, RulesetAttr, Scope},
    unshare::{
        error::ErrorCode as Err,
        run::{ChildInfo, Exe},
    },
};

unsafe fn fail_errno(code: Err, errno: i32) -> ! {
    let msg = match code {
        Err::CapSet => c"syd: capset error".as_ptr(),
        Err::Exec => c"syd: exec error".as_ptr(),
        Err::ParentDeathSignal => c"syd: parent-death-signal error".as_ptr(),
        Err::PreExec => c"syd: pre-exec error".as_ptr(),
        Err::ProcessStop => c"syd: error stopping process".as_ptr(),
        Err::ResetSignal => c"syd: error reseting signals".as_ptr(),
        Err::Seccomp => c"syd: seccomp error".as_ptr(),
        Err::SeccompFilterIoctl => c"syd: seccomp filter ioctl error".as_ptr(),
        Err::SeccompFilterAppendOnly => c"syd: seccomp filter pwritev2 error".as_ptr(),
        Err::SeccompSendFd => c"syd: seccomp send notify-fd error".as_ptr(),
        Err::SeccompWaitFd => c"syd: seccomp wait for notify-fd error".as_ptr(),
        Err::SetSid => c"syd: setsid error".as_ptr(),
        Err::SetPty => c"syd: error setting pty as controlling terminal".as_ptr(),
        Err::DupPty => c"syd: error duplicating pty onto stdio fds".as_ptr(),
        Err::SetPgrp => c"syd: error setting foreground process group".as_ptr(),
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        Err::SetTSC => c"syd: set-tsc error".as_ptr(),
    };
    Errno::set_raw(errno);
    libc::perror(msg as *const libc::c_char);
    libc::_exit(errno);
}

/// Terminate the child via `fail_errno`, using the calling thread's current
/// errno (`Errno::last_raw()`) as the error number.
///
/// The first argument is accepted but not used by the expansion.
macro_rules! fail_safe {
    ($child:expr, $error:expr) => {{
        // Snapshot errno first, before the error-code expression is evaluated.
        let saved_errno = Errno::last_raw();
        unsafe { fail_errno($error, saved_errno) }
    }};
}

/// Terminate the child via `fail_errno` with an explicitly supplied errno value.
///
/// The first argument is accepted but not used by the expansion; the second
/// is the error code and the third is the raw errno forwarded to `fail_errno`.
macro_rules! fail_errno_safe {
    ($child:expr, $code:expr, $raw_errno:expr) => {
        unsafe { fail_errno($code, $raw_errno) }
    };
}

/// Child-side entry point that prepares the sandbox process and then hands
/// control to the target program or library.
///
/// The signature (`extern "C"`, opaque pointer in, `c_int` out) presumably
/// matches a clone(2)-style callback; confirm against the caller.
///
/// Steps, in the order they are performed:
///
/// 0. If a PTY fd was supplied: become session leader, make the PTY the
///    controlling terminal, make this process the foreground process group,
///    and duplicate the PTY onto stdin/stdout/stderr.
/// 1. Best-effort Landlock scoping (abstract UNIX sockets and signals).
/// 2. Install the seccomp ioctl(2) denylist filter, if one was supplied.
/// 3. Install the seccomp pwritev2(2) filter if `cfg.append_only` is set.
/// 4. Close file descriptors that must not leak, and remove `SYD_*`
///    (except `SYD_TEST_*`) and `CARGO_BIN_EXE_syd*` environment variables.
/// 5. Apply the optional parent-death signal, signal reset, `pre_exec`
///    callback, TSC denial (x86 only) and self-`SIGSTOP`.
/// 6. If a seccomp filter was supplied: load it, send the notify fd number
///    to the parent over a pipe and wait for the one-byte acknowledgement.
/// 7. Drop `CAP_SYS_PTRACE` unless `cfg.keep` is set, and set keepcaps.
/// 8. Call `syd_main` from the loaded library, or `execvp` the program.
///
/// Every failure path prints a message and exits through `fail_errno`; none
/// of the visible paths returns a value to the caller.
///
/// # Safety contract (not expressed in the signature)
///
/// `arg` must be a pointer to a heap-allocated [`ChildInfo`] that this
/// function may take ownership of: it is turned back into a `Box` below.
#[allow(clippy::cognitive_complexity)]
pub extern "C" fn child_after_clone(arg: *mut libc::c_void) -> libc::c_int {
    // SAFETY: arg is a valid ChildInfo structure.
    let mut child: Box<ChildInfo> = unsafe { Box::from_raw(arg as *mut ChildInfo) };

    // Restriction 0: Change controlling terminal to PTY as necessary.
    if let Some(pty_fd) = child.pty_fd.take() {
        // SAFETY: pty_fd is a valid FD.
        // Wrapping it in OwnedFd means it is closed when dropped below.
        let pty_fd = unsafe { OwnedFd::from_raw_fd(pty_fd) };

        // Become session leader so we can take a controlling TTY.
        if let Err(errno) = setsid() {
            fail_errno_safe!(child, Err::SetSid, errno as i32);
        }

        // Make the PTY fd our controlling terminal.
        if let Err(errno) =
            Errno::result(unsafe { libc::ioctl(pty_fd.as_raw_fd(), libc::TIOCSCTTY, 0) })
        {
            fail_errno_safe!(child, Err::SetPty, errno as i32);
        }

        // Make us the foreground process group.
        if let Err(errno) = tcsetpgrp(&pty_fd, getpid()) {
            fail_errno_safe!(child, Err::SetPgrp, errno as i32);
        }

        // Duplicate PTY fd onto stdio(3) fds.
        for std_fd in [libc::STDIN_FILENO, libc::STDOUT_FILENO, libc::STDERR_FILENO] {
            if let Err(errno) = Errno::result(unsafe { libc::dup2(pty_fd.as_raw_fd(), std_fd) }) {
                fail_errno_safe!(child, Err::DupPty, errno as i32);
            }
        }

        // Close the original PTY fd.
        // NOTE(review): this assumes pty_fd is not itself one of fds 0-2;
        // otherwise this drop would close a stdio fd -- confirm with the caller.
        drop(pty_fd);
    }

    // Restriction 1:
    //
    // Apply a Landlock scope sandbox to restrict
    // 1. Ptrace attach outside Landlock.
    // 2. Signal send outside Landlock.
    // 3. UNIX abstract socket connect outside Landlock.
    // We leave path and network restrictions for Landlock
    // to be configured by the user using Lock sandboxing.
    // This is an added layer for hardening and it's best-effort.
    //
    // Every result below is deliberately discarded: a failure to scope,
    // create, or enforce the ruleset does not abort the child.
    let mut ruleset = Ruleset::default();
    let ruleset_ref = &mut ruleset;
    ruleset_ref.set_compatibility(CompatLevel::BestEffort);
    let _ = ruleset_ref.scope(Scope::AbstractUnixSocket);
    let _ = ruleset_ref.scope(Scope::Signal);
    if let Ok(ruleset) = ruleset.create() {
        let _ = ruleset.restrict_self(RestrictSelfFlags::empty());
    }

    // Restriction 2:
    //
    // Add per-architecture seccomp(2) filters to deny unsafe ioctl(2) requests.
    // If the error carries no errno, ENOSYS is reported instead.
    if let Some(denylist) = child.ioctl_denylist.take() {
        if let Err(error) = confine_scmp_ioctl(&denylist, child.cfg.ssb) {
            let errno = error.errno().unwrap_or(Errno::ENOSYS);
            fail_errno_safe!(child, Err::SeccompFilterIoctl, errno as i32);
        }
    }

    // Restriction 3:
    //
    // Deny RWF_NOAPPEND for pwritev2(2) if append-only is enabled.
    // If the error carries no errno, ENOSYS is reported instead.
    if child.cfg.append_only {
        if let Err(error) = confine_scmp_pwritev2(child.cfg.ssb) {
            let errno = error.errno().unwrap_or(Errno::ENOSYS);
            fail_errno_safe!(child, Err::SeccompFilterAppendOnly, errno as i32);
        }
    }

    // SAFETY: Do not leak the static file descriptors to the sandbox process.
    proc_close();

    // SAFETY: Do not leak the following FDs to the sandbox process:
    // 1. Log file descriptor.
    // 2. IPC epoll file descriptor.
    // 3. IPC UNIX socket descriptor.
    // TODO: Move this to config.rs.
    //
    // Each environment variable is expected to hold a decimal fd number.
    // Unset or unparsable values are skipped, and close errors are ignored.
    const CLOSE_FD_ENVS: &[&str] = &[ENV_LOG_FD, ENV_IPC_POLL_FD, ENV_IPC_UNIX_FD];
    for env in CLOSE_FD_ENVS {
        let fd = if let Some(fd) = env::var_os(env) {
            btoi::<RawFd>(fd.as_bytes()).ok()
        } else {
            None
        };
        if let Some(fd) = fd {
            let _ = close(fd);
        }
    }

    // SAFETY: Clean Syd environment variables from process environment.
    // Note, we have just used ENV_LOG_FD above and do not need it anymore.
    // Removed: names starting with "CARGO_BIN_EXE_syd" or "SYD_".
    // Kept: names starting with "SYD_TEST_".
    for (key, _) in env::vars_os() {
        if key.as_bytes().starts_with(b"CARGO_BIN_EXE_syd")
            || (key.as_bytes().starts_with(b"SYD_") && !key.as_bytes().starts_with(b"SYD_TEST_"))
        {
            env::remove_var(key);
        }
    }

    // We'll write seccomp notify fd to the second pipe,
    // and read the acknowledgement notification from
    // the first pipe.
    let (pipe_ro, pipe_rw) = (child.seccomp_pipefd.0 .0, child.seccomp_pipefd.1 .1);

    // Close the unused ends of the pipes.
    drop(child.seccomp_pipefd.0 .1);
    drop(child.seccomp_pipefd.1 .0);

    // Ask the kernel to send us `death_sig` when the parent dies, if configured.
    if let Some(&sig) = child.cfg.death_sig.as_ref() {
        if let Err(errno) = Errno::result(unsafe {
            libc::prctl(libc::PR_SET_PDEATHSIG, sig as libc::c_ulong, 0, 0, 0)
        }) {
            fail_errno_safe!(child, Err::ParentDeathSignal, errno as i32);
        }
    }

    if child.cfg.restore_sigmask {
        // Reset blocking signals.
        // Step 1: Reset the signal mask using pthread_sigmask.
        // The return value of pthread_sigmask is not checked here.
        unsafe {
            let mut sigmask: libc::sigset_t = mem::zeroed();
            libc::sigemptyset(&mut sigmask);
            libc::pthread_sigmask(libc::SIG_SETMASK, &sigmask, ptr::null_mut());
        }
        // Step 2: Unblock all signals using sigprocmask.
        let sigmask = SigSet::all();
        if let Err(errno) = sigprocmask(SigmaskHow::SIG_UNBLOCK, Some(&sigmask), None) {
            fail_errno_safe!(child, Err::ResetSignal, errno as i32);
        }

        // Reset all signals to their default dispositions.
        if let Err(errno) = crate::reset_signals() {
            fail_errno_safe!(child, Err::ResetSignal, errno as i32);
        }
    }

    // Run the optional user-supplied hook; an error from it aborts the child.
    if let Some(callback) = &child.pre_exec {
        if let Err(errno) = callback() {
            fail_errno_safe!(child, Err::PreExec, errno as i32);
        }
    }

    // x86 only: set PR_TSC_SIGSEGV via prctl(2) when TSC denial is configured.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    if child.cfg.deny_tsc {
        if let Err(errno) =
            Errno::result(unsafe { libc::prctl(libc::PR_SET_TSC, libc::PR_TSC_SIGSEGV) })
        {
            fail_errno_safe!(child, Err::SetTSC, errno as i32);
        }
    }

    if child.cfg.stop {
        // Stop the process to give the parent a chance to seize us and set ptrace options.
        // This must happen _before_ loading the seccomp filter.
        if let Err(errno) = kill(getpid(), Signal::SIGSTOP) {
            fail_errno_safe!(child, Err::ProcessStop, errno as i32);
        }
    }

    if let Some(seccomp_filter) = child.seccomp_filter {
        // Load the seccomp filter.
        // On error, prefer the raw return code from the seccomp error (made
        // positive with abs()); fall back to the current errno otherwise.
        if let Err(scmp_err) = seccomp_filter.load() {
            fail_errno_safe!(
                child,
                Err::Seccomp,
                scmp_err
                    .sysrawrc()
                    .map(|errno| errno.abs())
                    .unwrap_or_else(|| Errno::last() as i32)
            );
        }

        // Get seccomp notification fd.
        // Errors are mapped to an errno the same way as for load() above.
        let seccomp_fd = match seccomp_filter.get_notify_fd() {
            Ok(fd) => {
                // SAFETY: get_notify_fd returns a valid FD.
                unsafe { OwnedFd::from_raw_fd(fd) }
            }
            Err(scmp_err) => fail_errno_safe!(
                child,
                Err::Seccomp,
                scmp_err
                    .sysrawrc()
                    .map(|errno| errno.abs())
                    .unwrap_or_else(|| Errno::last() as i32)
            ),
        };

        // Write the value of the seccomp notify fd to the pipe.
        // Handle partial writes and interrupts.
        // EOF means parent died before reading.
        // The fd number is sent as its little-endian byte representation.
        let fd = seccomp_fd.as_raw_fd().to_le_bytes();
        let mut nwrite = 0;
        while nwrite < fd.len() {
            #[allow(clippy::arithmetic_side_effects)]
            match retry_on_eintr(|| write(&pipe_rw, &fd[nwrite..])) {
                Ok(0) => {
                    // Parent died before reading.
                    // This should ideally never happen.
                    fail_errno_safe!(child, Err::SeccompSendFd, Errno::EIO as i32);
                }
                Ok(n) => nwrite += n,
                Err(errno) => fail_errno_safe!(child, Err::SeccompSendFd, errno as i32),
            }
        }

        // Close the write end of the pipe.
        drop(pipe_rw);

        // Wait for the parent to get the file descriptor.
        // Handle interrupts.
        // Partial read is not possible.
        // EOF means parent died before writing to the pipe.
        // The expected acknowledgement is a single byte with the value 42.
        let mut buf = [0u8; 1];
        match retry_on_eintr(|| read(&pipe_ro, &mut buf[..])) {
            Ok(0) => {
                // Parent died before writing.
                // This should ideally never happen.
                fail_errno_safe!(child, Err::SeccompWaitFd, Errno::EIO as i32);
            }
            Ok(1) if buf[0] == 42 => {
                // Parent received seccomp fd successfully.
                // We can go ahead and close our copy now.
            }
            // Any other byte value is treated as a programming error and panics.
            Ok(_) => unreachable!("BUG: The meaning of life is not {:#x}!", buf[0]),
            Err(errno) => fail_errno_safe!(child, Err::SeccompWaitFd, errno as i32),
        }

        // Close our copy of the seccomp-notify fd.
        // Parent process has already acknowledged that
        // it has received a copy of this fd.
        drop(seccomp_fd);

        // Release resources for seccomp BPF filter.
        // Memory allocation/deallocation is OK here
        // now that we have transferred over the
        // seccomp-notify fd to the parent process.
        // Otherwise we'd risk breaking Memory sandboxing.
        drop(seccomp_filter);

        // Close the read end of the pipe.
        drop(pipe_ro);
    } else {
        // Close unused ends of the pipes.
        drop(pipe_ro);
        drop(pipe_rw);
    }

    // Drop CAP_SYS_PTRACE late as Syd may need it.
    // The drop is skipped when `cfg.keep` is set.
    // Both failure paths report the current errno.
    if !child.cfg.keep && safe_drop_cap(caps::Capability::CAP_SYS_PTRACE).is_err() {
        fail_safe!(child, Err::CapSet);
    }
    if caps::securebits::set_keepcaps(true).is_err() {
        fail_safe!(child, Err::CapSet);
    }

    // TODO:
    // Set the new secure bits:
    // 1. SECBIT_EXEC_RESTRICT_FILE
    // 2. SECBIT_EXEC_DENY_INTERACTIVE
    // upon user configuration.
    // See: https://docs.kernel.org/userspace-api/check_exec.html
    // Note, we already use AT_EXECVE_CHECK.

    // Hand over control. Neither arm returns to the caller.
    match child.exe {
        // Library mode: look up the `syd_main` symbol and exit with its return
        // value. A missing symbol exits with EINVAL and prints no message.
        Exe::Library(lib) => unsafe {
            let fun: Symbol<unsafe extern "C" fn() -> i32> = match lib.get(b"syd_main") {
                Ok(fun) => fun,
                Err(_) => nix::libc::_exit(nix::libc::EINVAL),
            };
            nix::libc::_exit(fun());
        },
        // Program mode: execvp(3) only returns on failure, in which case the
        // current errno is reported and the child exits.
        Exe::Program((filename, ref args)) => {
            let args = &args[..];
            unsafe { libc::execvp(filename, args.as_ptr()) };
            fail_safe!(child, Err::Exec);
        }
    }
}
