# language101_scraper.py

#!/usr/bin/env python3
# language101 scraper helps you scrape full language courses from sites like
# japanesepod101.com, spanishpod101.com, chineseclass101.com and more!

import argparse
import time
from getpass import getpass
from sys import exit
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

# Command-line interface: every value may be given as a flag; any value that
# is missing (or empty) is asked for interactively instead.
parser = argparse.ArgumentParser(description='Scrape full language courses by Innovative Language.')
parser.add_argument('-u', '--username', help='Username (email)')
parser.add_argument('-p', '--password', help='Password for the course')
parser.add_argument('--url', help='URL for the first lesson of the course')

args = parser.parse_args()

USERNAME = args.username or input('Username (email): ')
# getpass() prompts like input() but does not echo the typed password.
PASSWORD = args.password or getpass('Password: ')
# Whitespace picked up while pasting is not part of the URL, so it is stripped
# before the URL is parsed and requested.
COURSE_URL = (args.url or input('Please insert first lesson URL of the desired course, for example:\n'
                                '* https://www.japanesepod101.com/lesson/lower-beginner-1-a-formal-japanese'
                                '-introduction/?lp=116\n '
                                '* https://www.spanishpod101.com/lesson/basic-bootcamp-1-a-pleasure-to-meet-you/?lp'
                                '=425\n '
                                '* https://www.chineseclass101.com/lesson/absolute-beginner-1-meeting-whats-your-name'
                                '/?lp=208\n')).strip()

# Form fields sent in the body of the login POST request.
LOGIN_DATA = {
    'amember_login': USERNAME,
    'amember_pass': PASSWORD,
}

# Site root (scheme + host) of the course URL, e.g. for
# 'https://www.japanesepod101.com/lesson/...' it is 'https://www.japanesepod101.com'.
# Scheme and host must be joined with '://' for the result to be a valid
# absolute URL; lesson paths and the login path are appended to it.
obj = urlparse(COURSE_URL)
SOURCE_URL = f'{obj.scheme}://{obj.netloc}'
LOGIN_URL = f'{SOURCE_URL}/member/login_new.php'

# Characters removed from a lesson title before it is used in a file name;
# leaving them in causes "OSError: [Errno 22]" while writing the file.
INVALID_FILENAME_CHARS = '\\/?:*"<>|'
_FILENAME_CLEANER = str.maketrans('', '', INVALID_FILENAME_CHARS)

# HTML tag scanned on every lesson page, paired with the file extensions that
# are downloaded for that tag.
MEDIA_KINDS = (
    ('audio', ('.mp3',)),
    ('video', ('.mp4', '.m4v')),
)

# Seconds to wait for the server before a request is given up.
REQUEST_TIMEOUT = 60

# Seconds to pause after every saved file.
DOWNLOAD_PAUSE = 5


def _lesson_title(lesson_soup):
    """Return the text of the page's <title>, or a placeholder if it has none."""
    if lesson_soup.title is None:
        return 'Untitled lesson'
    return lesson_soup.title.text


def _track_label(file_url):
    """Map the last path segment of *file_url* to a readable track name."""
    track = file_url.split('/')[-1].lower()
    if 'dialog' in track:  # also matches 'dialogue'
        return 'Dialogue'
    if 'review' in track:
        return 'Review'
    return 'Main Lesson'


def _build_file_name(file_index, title, file_url):
    """Compose '<NN> - <title> - <track>.<extension>' for a media file.

    The extension is whatever follows the last '.' of *file_url*; callers only
    pass URLs that end with one of the accepted extensions.
    """
    clean_title = title.translate(_FILENAME_CLEANER)
    extension = file_url.split('.')[-1]
    return f'{file_index:02d} - {clean_title} - {_track_label(file_url)}.{extension}'


def _download_media(session, lesson_soup, tag_name, extensions, file_index):
    """Save every file referenced by the <tag_name> tags of one lesson page.

    Args:
        session: requests session used for the downloads.
        lesson_soup: parsed lesson page.
        tag_name: name of the HTML tag to look for ('audio' or 'video').
        extensions: tuple of URL suffixes that are accepted for this tag.
        file_index: number used as the file-name prefix.

    Returns:
        True when the page contains at least one <tag_name> tag (even if no
        file could be saved), False when it contains none.
    """
    media_tags = lesson_soup.find_all(tag_name)
    if not media_tags:
        return False

    title = _lesson_title(lesson_soup)
    print(f'Downloading Lesson {file_index:02d} - {title}')

    for media_tag in media_tags:
        file_url = media_tag.get('data-trackurl')
        if file_url is None:
            print('Tag "data-trackurl" was not found, trying to reach "data-url" tag instead')
            file_url = media_tag.get('data-url')
            if file_url is None:
                # There is no URL to show here: neither attribute exists.
                print(f'Could not retrieve a URL from this <{tag_name}> tag, skipping it.')
                continue

        # Only files with an accepted extension are downloaded.
        if not file_url.endswith(extensions):
            continue
        print(f'Successfully retrieved URL: {file_url}')

        file_name = _build_file_name(file_index, title, file_url)

        # Saves file on local folder:
        try:
            lesson_response = session.get(file_url, timeout=REQUEST_TIMEOUT)
            # Without this check the body of an error page would be written
            # to disk under a media file name.
            lesson_response.raise_for_status()
            with open(file_name, 'wb') as f:
                f.write(lesson_response.content)
        except (requests.RequestException, OSError) as e:
            print(e)
            print(f'Failed to save {file_name} on local device.')
            continue

        print(f'{file_name} saved on local device!')
        print('Pausing before next file...\n')
        time.sleep(DOWNLOAD_PAUSE)

    return True


# Login to the website with a user agent to bypass fw:
print('Establishing a new session...')
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/63.0.3239.132 Safari/537.36'})
with session:
    # Check URL is lesson and not lesson library. Done before any request is
    # sent; the slice yields an empty list (no IndexError) for a URL without
    # a path.
    if urlparse(COURSE_URL).path.split('/')[1:2] == ['lesson-library']:
        print('\nThe supplied URL is not a lesson - it is the course contents page!\n'
              'Please click the first lesson and try that URL.')
        exit(1)

    try:
        print(f'Trying to login to {SOURCE_URL}')
        login_response = session.post(LOGIN_URL, data=LOGIN_DATA, timeout=REQUEST_TIMEOUT)
        login_response.raise_for_status()
    except requests.RequestException as e:
        print(e)
        print(
            'Login Failed, please check urls input, login details and internet connection.')
        exit(1)
    # NOTE(review): only the HTTP status is checked; the site may answer a
    # rejected login with a normal page - confirm against the real endpoint.
    print(f'Successfully logged in as {USERNAME}')

    try:
        course_source = session.get(COURSE_URL, timeout=REQUEST_TIMEOUT)
        course_source.raise_for_status()
    except requests.RequestException as e:
        print(e)
        print('Loading of course URL page failed, please make sure URL is accurate.')
        exit(1)

    # Creates a list of course urls which will be downloaded:
    try:
        course_soup = BeautifulSoup(course_source.text, 'lxml')
    except Exception as e:  # kept broad: bs4's own exception types are not imported here
        print(e)
        print("Failed to parse the course's webpage, 'lxml' package might be missing.")
        exit(1)

    # Lesson links are the <option> values that start with '/lesson/';
    # options without a value attribute are ignored.
    course_urls = [
        SOURCE_URL + option.get('value', '')
        for option in course_soup.find_all('option')
        if option.get('value', '').startswith('/lesson/')
    ]

    if not course_urls:
        print('No lesson URLs were found on the course page, '
              'please check the URL and the login details.')
        exit(1)

    print('Lessons URLs successfully listed.')

    # Traverses list of course's lesson urls and downloads them:
    file_index = 1  # Used for numbering of file name strings
    for lesson_url in course_urls:
        try:
            lesson_source = session.get(lesson_url, timeout=REQUEST_TIMEOUT)
            lesson_source.raise_for_status()
        except requests.RequestException as e:
            # One unreachable lesson should not abort the rest of the course.
            print(e)
            print(f'Failed to load lesson page {lesson_url}, skipping it.')
            continue

        lesson_soup = BeautifulSoup(lesson_source.text, 'lxml')

        # The counter advances once per media kind found on the page, so a
        # lesson that has both audio and video uses two consecutive numbers.
        for tag_name, extensions in MEDIA_KINDS:
            if _download_media(session, lesson_soup, tag_name, extensions, file_index):
                file_index += 1

print('Yatta! Finished downloading the course~')