# download_video_from_list.py

import multiprocessing as mp
import os
import subprocess

import wget
from selenium.webdriver.chrome.options import Options
from seleniumwire import webdriver
from seleniumwire.request import Response
from videoprops import get_video_properties
from webdriver_manager.chrome import ChromeDriverManager

"""
The audio and video streams are downloaded as separate files. To combine them, you need to install and use FFmpeg.
On Windows, you may also need to install the Media Feature Pack so that FFmpeg can run properly.
Find your Windows build version and download the matching Media Feature Pack here:
https://support.microsoft.com/en-us/topic/media-feature-pack-list-for-windows-n-editions-c1c6fffa-d052-8338-7a79-a4bb980a700a
"""

# Directory prefix for all output files; it is concatenated directly with the
# title, so it must end with a path separator.
DOWNLOAD_FOLDER = "D:\\Colorful_Update\\"
# Upper bound on the number of download processes running at the same time.
NUM_PROC = 4
# UTF-8 text file read by main(): one "<title>\t<url path>" entry per line.
LIST_FILE = "vlist.txt"

def _remove_if_exists(path):
    """Delete *path*, doing nothing when the file is already gone."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def _pick_largest_streams(requests):
    """Select the largest video and audio stream URLs from captured requests.

    Only requests whose response has ``Content-Type`` equal to ``video/mp4``
    and a ``Content-Range`` header ending in a numeric total size are
    considered. A URL containing ``media-video-avc1`` is treated as video;
    every other matching URL is treated as audio.

    Args:
        requests: Iterable of captured requests exposing ``url`` and
            ``response.headers`` (as ``driver.requests`` provides).

    Returns:
        A ``(links, audio_url, video_url)`` tuple. ``links`` maps each distinct
        stream URL to ``(media_type, total_size)``; ``audio_url`` and
        ``video_url`` are ``None`` when no stream of that kind was seen.
    """
    video_links = {}
    max_video_size = -1
    max_video_link = None
    max_audio_size = -1
    max_audio_link = None

    for request in requests:
        response = request.response
        if not response:
            continue
        if response.headers.get('Content-Type') != 'video/mp4':
            continue

        content_range = response.headers.get('Content-Range')
        if not content_range:
            continue
        try:
            # "bytes a-b/total" -> total; an unknown total ("*") is skipped.
            media_size = int(content_range.split('/')[-1])
        except ValueError:
            continue

        # Keep everything before the first '&net=' (the whole URL when the
        # parameter is absent) instead of requiring exactly one occurrence.
        new_url = request.url.partition('&net=')[0]

        if 'media-video-avc1' in new_url:
            media_type = 'video'
            if media_size > max_video_size:
                max_video_link = new_url
                max_video_size = media_size
        else:
            media_type = 'audio'
            if media_size > max_audio_size:
                max_audio_link = new_url
                max_audio_size = media_size

        if new_url not in video_links:
            video_links[new_url] = (media_type, media_size)

    return video_links, max_audio_link, max_video_link


def download_video(title, html_url):
    """Download one video page's best audio/video streams and merge them.

    The page is opened in headless Chrome, the media requests it issues are
    inspected, the largest audio and video streams are downloaded with
    ``wget`` and finally muxed into ``<DOWNLOAD_FOLDER><title>.mp4`` with
    ``ffmpeg -c copy``. Temporary part files are always removed.

    Args:
        title: Base name of the output file ('?' characters are dropped).
        html_url: URL of the page that plays the video.

    Returns:
        ``"Exists"`` if the merged file is already present, ``"Done"`` after a
        successful merge, and ``"Failed"`` when a stream could not be found,
        downloaded or merged.
    """
    # '?' is stripped up front so the existence check below looks at the same
    # file name that the merge step eventually writes.
    mp4_file = (DOWNLOAD_FOLDER + title + '.mp4').replace('?', '')
    mp4_tmp = (DOWNLOAD_FOLDER + title + '_.mp4').replace('?', '')
    mp3_file = (DOWNLOAD_FOLDER + title + '.mp3').replace('?', '')
    if os.path.exists(mp4_file):
        return "Exists"

    chrome_options = Options()
    chrome_options.add_argument('--headless')
    # NOTE(review): Chrome's switch is spelled "--user-agent"; with the
    # underscore this argument is probably ignored. Left untouched because
    # changing the user agent may change what the site serves - confirm.
    chrome_options.add_argument('--user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15"')
    driver = webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=chrome_options)
    try:
        driver.get(html_url)
        driver.implicitly_wait(9)
        # The result is unused; the lookup presumably serves only as a wait
        # (up to the implicit 9 s) for the page to finish loading.
        driver.find_elements_by_xpath('//div[@class="commentList"]')
        print(title)
        video_links, max_audio_link, max_video_link = _pick_largest_streams(driver.requests)
    finally:
        # quit() ends the whole browser session even when scraping raised.
        driver.quit()

    print(video_links)
    for k, info in video_links.items():
        print("-" * 20)
        print(k)
        print(info)
        print("=" * 20)

    if max_audio_link is None or max_video_link is None:
        print("Download Error", max_audio_link, max_video_link)
        return "Failed"

    # Clear leftovers of an interrupted run so the parts are written to the
    # expected paths (wget appears to pick a different name if `out` exists).
    _remove_if_exists(mp3_file)
    _remove_if_exists(mp4_tmp)
    try:
        try:
            wget.download(max_audio_link, out=mp3_file)
            wget.download(max_video_link, out=mp4_tmp)
        except Exception as e:
            # Best-effort worker: report the failure and let other videos go on.
            print(e)
            print("Download Error", max_audio_link, max_video_link)
            return "Failed"

        if not (os.path.exists(mp4_tmp) and os.path.exists(mp3_file)):
            return "Failed"

        try:
            # Argument list instead of a shell string: titles containing
            # quotes or shell metacharacters can no longer break the command.
            merged = subprocess.run(
                ['ffmpeg', '-i', mp4_tmp, '-i', mp3_file, '-c', 'copy', mp4_file]
            ).returncode == 0
        except OSError as e:
            # Raised e.g. when the ffmpeg executable cannot be found.
            print(e)
            merged = False

        if not merged:
            # Do not leave a partial output behind, or the next run would
            # report this video as "Exists".
            _remove_if_exists(mp4_file)
            return "Failed"
        return "Done"
    finally:
        _remove_if_exists(mp4_tmp)
        _remove_if_exists(mp3_file)


def main():
    """Download every video listed in LIST_FILE, at most NUM_PROC at a time.

    Each non-blank line of the list file holds a title and a URL path
    separated by a tab; the path is appended to ``https://www.ixigua.com``.
    Lines that do not contain a tab-separated title and URL are reported and
    skipped. An empty list results in no downloads.
    """
    title_dict = {}

    with open(LIST_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                continue
            # Split on the last tab so a stray tab inside the title cannot
            # break the parsing of the whole file.
            title, sep, url = entry.rpartition('\t')
            if not sep:
                print("Skipping malformed line:", entry)
                continue
            url = 'https://www.ixigua.com' + url.strip()
            title_dict[url] = title.strip()

    print(title_dict)

    # Guard against a non-positive setting, which could never start a worker.
    max_workers = max(1, NUM_PROC)
    running = []
    for url, title in title_dict.items():
        # Wait for a free slot. join() with a timeout blocks instead of
        # spinning, and lets us notice any worker that finished meanwhile.
        while len(running) >= max_workers:
            running[0].join(timeout=0.5)
            still_running = []
            for proc in running:
                if proc.is_alive():
                    still_running.append(proc)
                else:
                    proc.join()
            running = still_running

        proc = mp.Process(target=download_video, args=(title, url))
        proc.start()
        running.append(proc)

    # Wait for the remaining workers before returning.
    for proc in running:
        proc.join()

# Run only when executed as a script. The guard also keeps main() from being
# re-run when a multiprocessing child imports this module (e.g. under the
# "spawn" start method).
if __name__ == '__main__':
    main()