Merge pull request piskvorky#3459 from Holmes5/httpsfix

Replace HTTP with HTTPS in enwiki URLs
pabs3 · Apr 29, 2023 · 4ca0653 · 4ca0653
2 parents 525f67a + 5207fab
commit 4ca0653
Show file tree

Hide file tree

Showing 8 changed files with 12 additions and 12 deletions.
diff --git a/docs/notebooks/Word2Vec_FastText_Comparison.ipynb b/docs/notebooks/Word2Vec_FastText_Comparison.ipynb
@@ -40,7 +40,7 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "--2019-05-12 19:40:14-- http://mattmahoney.net/dc/enwik9.zip\n",
+ "--2019-05-12 19:40:14-- https://mattmahoney.net/dc/enwik9.zip\n",
  "Resolving mattmahoney.net (mattmahoney.net)... 67.195.197.75\n",
  "Connecting to mattmahoney.net (mattmahoney.net)|67.195.197.75|:80... connected.\n",
  "HTTP request sent, awaiting response... 200 OK\n",
@@ -51,7 +51,7 @@
  "\n",
  "2019-05-12 19:50:17 (247 KB/s) - Connection closed at byte 152553031. Retrying.\n",
  "\n",
- "--2019-05-12 19:50:18-- (try: 2) http://mattmahoney.net/dc/enwik9.zip\n",
+ "--2019-05-12 19:50:18-- (try: 2) https://mattmahoney.net/dc/enwik9.zip\n",
  "Connecting to mattmahoney.net (mattmahoney.net)|67.195.197.75|:80... connected.\n",
  "HTTP request sent, awaiting response... 206 Partial Content\n",
  "Length: 322592222 (308M), 170039191 (162M) remaining [application/zip]\n",
@@ -83,11 +83,11 @@
  "# download the text8 corpus (a 100 MB sample of cleaned wikipedia text)\n",
  "import os.path\n",
  "if not os.path.isfile('text8'):\n",
- " !wget -c http://mattmahoney.net/dc/text8.zip\n",
+ " !wget -c https://mattmahoney.net/dc/text8.zip\n",
  " !unzip text8.zip\n",
  "# download and preprocess the text9 corpus\n",
  "if not os.path.isfile('text9'):\n",
- " !wget -c http://mattmahoney.net/dc/enwik9.zip\n",
+ " !wget -c https://mattmahoney.net/dc/enwik9.zip\n",
  " !unzip enwik9.zip\n",
  " !perl {FT_HOME}wikifil.pl enwik9 > text9"
  ]

diff --git a/docs/notebooks/Wordrank_comparisons.ipynb b/docs/notebooks/Wordrank_comparisons.ipynb
@@ -62,7 +62,7 @@
  "# download the text8 corpus (a 100 MB sample of preprocessed wikipedia text)\n",
  "import os.path\n",
  "if not os.path.isfile('text8'):\n",
- " !wget -c http://mattmahoney.net/dc/text8.zip\n",
+ " !wget -c https://mattmahoney.net/dc/text8.zip\n",
  " !unzip text8.zip"
  ]
  },

diff --git a/docs/notebooks/downloader_api_tutorial.ipynb b/docs/notebooks/downloader_api_tutorial.ipynb
@@ -328,7 +328,7 @@
  " \"parts\": 3\n",
  " }, \n",
  " \"text8\": {\n",
- " \"source\": \"http://mattmahoney.net/dc/text8.zip\", \n",
+ " \"source\": \"https://mattmahoney.net/dc/text8.zip\", \n",
  " \"checksum\": \"68799af40b6bda07dfa47a32612e5364\", \n",
  " \"parts\": 1, \n",
  " \"description\": \"Cleaned small sample from wikipedia\", \n",

diff --git a/docs/notebooks/nmslibtutorial.ipynb b/docs/notebooks/nmslibtutorial.ipynb
@@ -87,7 +87,7 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "--2019-06-27 13:48:42-- http://mattmahoney.net/dc/text8.zip\n",
+ "--2019-06-27 13:48:42-- https://mattmahoney.net/dc/text8.zip\n",
  "Resolving mattmahoney.net... 67.195.197.75\n",
  "Connecting to mattmahoney.net|67.195.197.75|:80... connected.\n",
  "HTTP request sent, awaiting response... 200 OK\n",
@@ -106,7 +106,7 @@
  "source": [
  "import os.path\n",
  "if not os.path.isfile('text8'):\n",
- " !wget -c http://mattmahoney.net/dc/text8.zip\n",
+ " !wget -c https://mattmahoney.net/dc/text8.zip\n",
  " !unzip text8.zip"
  ]
  },

diff --git a/docs/src/auto_examples/howtos/run_downloader_api.rst b/docs/src/auto_examples/howtos/run_downloader_api.rst
@@ -335,7 +335,7 @@ Here's how to list all resources available in gensim-data:
  "checksum": "68799af40b6bda07dfa47a32612e5364",
  "file_name": "text8.gz",
  "read_more": [
- "http://mattmahoney.net/dc/textdata.html"
+ "https://mattmahoney.net/dc/textdata.html"
  ],
  "parts": 1
  },

diff --git a/gensim/downloader.py b/gensim/downloader.py
@@ -260,7 +260,7 @@ def info(name=None, show_only_latest=True, name_only=False):
  u'description': u'Cleaned small sample from wikipedia',
  u'file_name': u'text8.gz',
  u'parts': 1,
- u'source': u'http://mattmahoney.net/dc/text8.zip'}
+ u'source': u'https://mattmahoney.net/dc/text8.zip'}
  >>>
  >>> api.info() # retrieve information about all available datasets and models
 

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
@@ -2041,7 +2041,7 @@ def __iter__(self):
 
 class Text8Corpus:
  def __init__(self, fname, max_sentence_length=MAX_WORDS_IN_BATCH):
- """Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip."""
+ """Iterate over sentences from the "text8" corpus, unzipped from https://mattmahoney.net/dc/text8.zip."""
  self.fname = fname
  self.max_sentence_length = max_sentence_length
 

diff --git a/gensim/scripts/benchmark.py b/gensim/scripts/benchmark.py
@@ -30,6 +30,6 @@
  print(globals()['__doc__'] % locals())
  sys.exit(1)
 
- corpus = Text8Corpus(sys.argv[1]) # text8/text9 format from http://mattmahoney.net/dc/textdata.html
+ corpus = Text8Corpus(sys.argv[1]) # text8/text9 format from https://mattmahoney.net/dc/textdata.html
  cls = FastText
  cls(corpus, workers=12, epochs=1).save(f'/tmp/{cls.__name__}.gensim{__version__}')