# calculations.rb

# Duration of a single audio frame, in microseconds.
#
# The frame size (1024 samples, presumably the AAC frame length) and the
# 44100 Hz sample rate are fixed here; 44100 matches the `-ar 44100` passed to
# ffmpeg when transcoding.
def frame_duration
  samples_per_frame = 1024.0
  sample_rate_hz = 44100.0
  microseconds_per_second = 1_000_000.0

  samples_per_frame / sample_rate_hz * microseconds_per_second
end

# Snaps target_time to the nearest frame boundary.
#
# target_time is converted with #to_f and expressed as a (fractional) number
# of frames; that count is rounded to the nearest whole frame and converted
# back to a time. The result is in the same unit as frame_duration.
#
# Logs the intermediate values to stdout.
def get_closest_aligned_time(target_time)
  frame_length = frame_duration
  fractional_frames = target_time.to_f / frame_length
  frame_index = fractional_frames.round

  puts "target_time: #{target_time}, decimal_frames_to_target_time: #{fractional_frames}, nearest_frame_index_for_target_time: #{frame_index}"

  frame_index * frame_length
end

# Builds everything needed to produce one audio segment: the ffmpeg command
# that writes out/seg<N>.aac, and the directive lines (file / inpoint /
# outpoint / duration) that describe which slice of that file is real content.
#
# input_file   - path interpolated verbatim into the ffmpeg command
# index        - zero-based segment index (the output file is numbered index + 1)
# target_start - desired segment start; snapped to the nearest frame boundary
# target_end   - desired segment end; snapped to the nearest frame boundary
# is_last      - truthy when this is the final segment
#
# Returns a two-element array: [command_string, directives_string], where the
# directives are joined with newlines. Progress values are logged to stdout.
def generate_command_and_directives_for_segment(input_file, index, target_start, target_end, is_last)
  segment_number = index + 1
  puts "--- segment #{segment_number} ---"

  one_frame = frame_duration
  two_frames = one_frame * 2

  start_time = get_closest_aligned_time(target_start)
  end_time = get_closest_aligned_time(target_end)
  puts "start_time: #{start_time}, end_time: #{end_time}"

  real_duration = end_time - start_time
  puts "real_duration: #{real_duration}"

  # ffmpeg always internally adds 2 frames of priming to the start of the
  # stream, so pull the start back by two frames (never going below zero).
  padded_start = [start_time - two_frames, 0].max

  # ffmpeg tapers the last few frames to avoid a pop when the audio stops. We
  # want the untouched signal, so push the end out by two frames; the taper
  # then lands past the content we care about and is cut off by outpoint.
  padded_end = end_time + two_frames
  puts "start_time_with_padding: #{padded_start}, end_time_with_padding: #{padded_end}"

  if index > 0
    # AAC is an interframe format: each frame's encoding depends on the frame
    # before it (which is also why AAC pads the start with silence). Encoding
    # a little real audio ahead of the segment means the data we keep was
    # encoded as if the correct data preceded it -- because it did.
    #
    # Two frames is the minimum; any larger value also works (e.g. enough to
    # cover the full damping time of an echo effect).
    lead_in = two_frames
    padded_start = [padded_start - lead_in, 0].max

    # On top of the lead-in we requested, ffmpeg prepends 2 frames of silence.
    # inpoint has to skip both the silence and the lead-in.
    inpoint = two_frames + lead_in
  else
    inpoint = 0
  end

  padded_duration = padded_end - padded_start
  puts "padded_duration: #{padded_duration}"

  # inpoint is inclusive and outpoint is exclusive, so one frame is trimmed
  # from the outpoint to avoid overlap with the next segment. The last segment
  # has no successor and keeps the full length.
  outpoint = inpoint + real_duration - (is_last ? 0 : one_frame)

  # Things usually work without the duration directive, but supplying it means
  # ffmpeg doesn't have to "guess" each segment's length from its sample
  # count. Our math is higher fidelity, which for very long outputs may help
  # avoid de-sync and make seeking more predictably exact.
  duration_directive = outpoint - inpoint + one_frame

  puts "inpoint: #{inpoint}, outpoint: #{outpoint}"

  ffmpeg_base = "ffmpeg -hide_banner -loglevel error -nostats -y"
  trim_flags = "-ss #{padded_start}us -t #{padded_duration}us"
  output_flags = "-f adts out/seg#{segment_number}.aac"

  command =
    if ENV["NO_TRANSCODE"]
      # When the input is already AAC and the sample rate is unchanged, the
      # stream can be sliced at AAC frame boundaries without transcoding.
      # Here -ss and -t come after the input file so they're applied after the
      # input is read; without that, you'll get some funky output.
      "#{ffmpeg_base} -i #{input_file} -c:a copy #{trim_flags} #{output_flags}"
    else
      "#{ffmpeg_base} #{trim_flags} -i #{input_file} -c:a libfdk_aac -ar 44100 #{output_flags}"
    end

  directives = <<~DIRECTIVES.chomp
    file 'seg#{segment_number}.aac'
    inpoint #{inpoint}us
    outpoint #{outpoint}us
    duration #{duration_directive}us
  DIRECTIVES

  [command, directives]
end