We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
How can I fix the code that produced the following error?
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) File ~/anaconda3/lib/python3.11/site-packages/datasets/builder.py:1873, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id) 1866 writer = writer_class( 1867 features=writer._features, 1868 path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"), (...) 1871 embed_local_files=embed_local_files, 1872 ) -> 1873 writer.write_table(table) 1874 num_examples_progress_update += len(table) File ~/anaconda3/lib/python3.11/site-packages/datasets/arrow_writer.py:568, in ArrowWriter.write_table(self, pa_table, writer_batch_size) 567 pa_table = pa_table.combine_chunks() --> 568 pa_table = table_cast(pa_table, self._schema) 569 if self.embed_local_files: File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:2290, in table_cast(table, schema) 2289 if table.schema != schema: -> 2290 return cast_table_to_schema(table, schema) 2291 elif table.schema.metadata != schema.metadata: File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:2249, in cast_table_to_schema(table, schema) 2248 raise ValueError(f"Couldn't cast\n{table.schema}\nto\n{features}\nbecause column names don't match") -> 2249 arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()] 2250 return pa.Table.from_arrays(arrays, schema=schema) File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:2249, in <listcomp>(.0) 2248 raise ValueError(f"Couldn't cast\n{table.schema}\nto\n{features}\nbecause column names don't match") -> 2249 arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()] 2250 return pa.Table.from_arrays(arrays, schema=schema) File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:1817, in _wrap_for_chunked_arrays.<locals>.wrapper(array, *args, **kwargs) 1816 if isinstance(array, pa.ChunkedArray): -> 1817 return 
pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks]) 1818 else: File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:1817, in <listcomp>(.0) 1816 if isinstance(array, pa.ChunkedArray): -> 1817 return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks]) 1818 else: File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:2109, in cast_array_to_feature(array, feature, allow_number_to_str) 2108 elif not isinstance(feature, (Sequence, dict, list, tuple)): -> 2109 return array_cast(array, feature(), allow_number_to_str=allow_number_to_str) 2110 raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{feature}") File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:1819, in _wrap_for_chunked_arrays.<locals>.wrapper(array, *args, **kwargs) 1818 else: -> 1819 return func(array, *args, **kwargs) File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:2000, in array_cast(array, pa_type, allow_number_to_str) 1999 return array.cast(pa_type) -> 2000 raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{pa_type}") TypeError: Couldn't cast array of type struct<ca: string, en: string> to struct<ca: string, de: string> The above exception was the direct cause of the following exception: DatasetGenerationError Traceback (most recent call last) Cell In[1], line 9 5 cfg['num_epochs'] = 1 7 from train import train_model ----> 9 train_model(cfg) File ~/git/github/pytorch-transformer/train.py:198, in train_model(config) 195 # Make sure the weights folder exists 196 Path(f"{config['datasource']}_{config['model_folder']}").mkdir(parents=True, exist_ok=True) --> 198 train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config) 199 model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device) 200 # Tensorboard File ~/git/github/pytorch-transformer/train.py:143, in get_ds(config) 141 def get_ds(config): 142 # It only has the train split, 
so we divide it overselves --> 143 ds_raw = load_dataset(f"{config['datasource']}", f"{config['lang_src']}-{config['lang_tgt']}", split='train') 145 # Build tokenizers 146 tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src']) File ~/anaconda3/lib/python3.11/site-packages/datasets/load.py:1797, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs) 1794 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES 1796 # Download and prepare data -> 1797 builder_instance.download_and_prepare( 1798 download_config=download_config, 1799 download_mode=download_mode, 1800 verification_mode=verification_mode, 1801 try_from_hf_gcs=try_from_hf_gcs, 1802 num_proc=num_proc, 1803 storage_options=storage_options, 1804 ) 1806 # Build dataset for splits 1807 keep_in_memory = ( 1808 keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size) 1809 ) File ~/anaconda3/lib/python3.11/site-packages/datasets/builder.py:890, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs) 888 if num_proc is not None: 889 prepare_split_kwargs["num_proc"] = num_proc --> 890 self._download_and_prepare( 891 dl_manager=dl_manager, 892 verification_mode=verification_mode, 893 **prepare_split_kwargs, 894 **download_and_prepare_kwargs, 895 ) 896 # Sync info 897 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values()) File ~/anaconda3/lib/python3.11/site-packages/datasets/builder.py:985, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs) 981 
split_dict.add(split_generator.split_info) 983 try: 984 # Prepare split will record examples associated to the split --> 985 self._prepare_split(split_generator, **prepare_split_kwargs) 986 except OSError as e: 987 raise OSError( 988 "Cannot find data file. " 989 + (self.manual_download_instructions or "") 990 + "\nOriginal error:\n" 991 + str(e) 992 ) from None File ~/anaconda3/lib/python3.11/site-packages/datasets/builder.py:1746, in ArrowBasedBuilder._prepare_split(self, split_generator, file_format, num_proc, max_shard_size) 1744 job_id = 0 1745 with pbar: -> 1746 for job_id, done, content in self._prepare_split_single( 1747 gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args 1748 ): 1749 if done: 1750 result = content File ~/anaconda3/lib/python3.11/site-packages/datasets/builder.py:1891, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id) 1889 if isinstance(e, SchemaInferenceError) and e.__context__ is not None: 1890 e = e.__context__ -> 1891 raise DatasetGenerationError("An error occurred while generating the dataset") from e 1893 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths) DatasetGenerationError: An error occurred while generating the dataset
----------------------------- ------------ absl-py 2.0.0 accelerate 0.28.0 aiobotocore 2.5.0 aiofiles 22.1.0 aiohttp 3.8.5 aioitertools 0.7.1 aiosignal 1.2.0 aiosqlite 0.18.0 alabaster 0.7.12 alembic 1.13.1 anaconda-anon-usage 0.4.2 anaconda-catalogs 0.2.0 anaconda-client 1.12.1 anaconda-cloud-auth 0.1.3 anaconda-navigator 2.5.0 anaconda-project 0.11.1 anyio 3.5.0 appdirs 1.4.4 applaunchservices 0.3.0 appnope 0.1.2 appscript 1.1.2 argon2-cffi 21.3.0 argon2-cffi-bindings 21.2.0 arrow 1.2.3 astroid 2.14.2 astropy 5.1 asttokens 2.0.5 astunparse 1.6.3 async-timeout 4.0.2 atomicwrites 1.4.0 attrs 22.1.0 audioread 3.0.1 Automat 20.2.0 autopep8 1.6.0 Babel 2.11.0 backcall 0.2.0 backports.functools-lru-cache 1.6.4 backports.tempfile 1.0 backports.weakref 1.0.post1 bcrypt 3.2.0 beautifulsoup4 4.12.2 binaryornot 0.4.4 black 0.0 bleach 4.1.0 bokeh 3.2.1 boltons 23.0.0 botocore 1.29.76 Bottleneck 1.3.5 brotlipy 0.7.0 cachetools 5.3.1 certifi 2023.7.22 cffi 1.15.1 chardet 4.0.0 charset-normalizer 2.0.4 click 8.0.4 cloudpickle 2.2.1 clyent 1.2.2 colorama 0.4.6 colorcet 3.0.1 colorlog 6.8.2 comm 0.1.2 conda 23.7.4 conda-build 3.26.1 conda-content-trust 0.2.0 conda_index 0.3.0 conda-libmamba-solver 23.9.1 conda-pack 0.6.0 conda-package-handling 2.2.0 conda_package_streaming 0.9.0 conda-repo-cli 1.0.75 conda-token 0.4.0 conda-verify 3.4.2 constantly 15.1.0 contourpy 1.0.5 cookiecutter 1.7.3 cryptography 41.0.3 cssselect 1.1.0 cycler 0.11.0 cytoolz 0.12.0 dask 2023.6.0 datasets 2.12.0 datashader 0.15.2 datashape 0.5.4 debugpy 1.6.7 decorator 5.1.1 defusedxml 0.7.1 diff-match-patch 20200713 diffuser 0.0.1 diffusers 0.27.0 dill 0.3.6 distributed 2023.6.0 docstring-to-markdown 0.11 docutils 0.18.1 entrypoints 0.4 et-xmlfile 1.1.0 executing 0.8.3 fastjsonschema 2.16.2 filelock 3.9.0 filterpy 1.4.5 flake8 6.0.0 Flask 2.2.2 flatbuffers 23.5.26 fonttools 4.25.0 frozenlist 1.3.3 fsspec 2023.5.0 future 0.18.3 gast 0.5.4 gensim 4.3.0 glob2 0.7 gmpy2 2.1.2 google-auth 2.23.3 
google-auth-oauthlib 1.0.0 google-pasta 0.2.0 graphviz 0.20.3 greenlet 2.0.1 grpcio 1.59.0 h5py 3.9.0 HeapDict 1.0.1 holoviews 1.17.1 huggingface-hub 0.21.4 hvplot 0.8.4 hyperlink 21.0.0 idna 3.4 imagecodecs 2023.1.23 imageio 2.31.1 imagesize 1.4.1 imbalanced-learn 0.10.1 importlib-metadata 6.0.0 incremental 21.3.0 inflection 0.5.1 iniconfig 1.1.1 intake 0.6.8 intervaltree 3.1.0 ipykernel 6.25.0 ipython 8.15.0 ipython-genutils 0.2.0 ipywidgets 8.0.4 isort 5.9.3 itemadapter 0.3.0 itemloaders 1.0.4 itsdangerous 2.0.1 jaraco.classes 3.2.1 jedi 0.18.1 jellyfish 1.0.1 Jinja2 3.1.2 jinja2-time 0.2.0 jmespath 0.10.0 joblib 1.2.0 json5 0.9.6 jsonpatch 1.32 jsonpointer 2.1 jsonschema 4.17.3 jupyter 1.0.0 jupyter_client 7.4.9 jupyter-console 6.6.3 jupyter_core 5.3.0 jupyter-events 0.6.3 jupyter-server 1.23.4 jupyter_server_fileid 0.9.0 jupyter_server_ydoc 0.8.0 jupyter-ydoc 0.2.4 jupyterlab 3.6.3 jupyterlab-pygments 0.1.2 jupyterlab_server 2.22.0 jupyterlab-widgets 3.0.5 kaggle 1.5.16 kaleido 0.2.1 keras 2.14.0 keras-tuner 1.4.6 keyring 23.13.1 kiwisolver 1.4.4 kt-legacy 1.0.5 lazy_loader 0.2 lazy-object-proxy 1.6.0 libarchive-c 2.9 libclang 16.0.6 libmambapy 1.5.1 librosa 0.10.1 lightning-utilities 0.11.2 linkify-it-py 2.0.0 llvmlite 0.40.0 lmdb 1.4.1 locket 1.0.0 lxml 4.9.3 lz4 4.3.2 Mako 1.3.5 Markdown 3.4.1 markdown-it-py 2.2.0 MarkupSafe 2.1.1 matplotlib 3.7.2 matplotlib-inline 0.1.6 mccabe 0.7.0 mdit-py-plugins 0.3.0 mdurl 0.1.0 mistune 0.8.4 ml-dtypes 0.2.0 more-itertools 8.12.0 mpmath 1.3.0 msgpack 1.0.3 multidict 6.0.2 multipledispatch 0.6.0 multiprocess 0.70.14 munkres 1.1.4 mypy-extensions 1.0.0 navigator-updater 0.4.0 nbclassic 0.5.5 nbclient 0.5.13 nbconvert 6.5.4 nbformat 5.9.2 nest-asyncio 1.5.6 networkx 3.1 nltk 3.8.1 notebook 6.5.4 notebook_shim 0.2.2 numba 0.57.0 numexpr 2.8.4 numpy 1.24.3 numpydoc 1.5.0 oauthlib 3.2.2 openpyxl 3.0.10 opt-einsum 3.3.0 optuna 3.6.1 packaging 23.1 pandas 2.0.3 pandocfilters 1.5.0 panel 1.2.3 param 1.13.0 parsel 1.6.0 parso 
0.8.3 partd 1.4.0 pathlib 1.0.1 pathspec 0.10.3 patsy 0.5.3 pep8 1.7.1 pexpect 4.8.0 pickleshare 0.7.5 Pillow 9.4.0 pip 23.2.1 pkce 1.0.3 pkginfo 1.9.6 platformdirs 3.10.0 plotly 5.9.0 pluggy 1.0.0 ply 3.11 pooch 1.8.0 portalocker 2.8.2 poyo 0.5.0 prometheus-client 0.14.1 prompt-toolkit 3.0.36 Protego 0.1.16 protobuf 4.24.4 psutil 5.9.0 ptyprocess 0.7.0 pure-eval 0.2.2 py-cpuinfo 8.0.0 pyarrow 11.0.0 pyasn1 0.4.8 pyasn1-modules 0.2.8 pycodestyle 2.10.0 pycosat 0.6.4 pycparser 2.21 pyct 0.5.0 pycurl 7.45.2 pydantic 1.10.8 PyDispatcher 2.0.5 pydocstyle 6.3.0 pyerfa 2.0.0 pyflakes 3.0.1 Pygments 2.15.1 PyJWT 2.4.0 pylint 2.16.2 pylint-venv 2.3.0 pyls-spyder 0.4.0 pyobjc-core 9.0 pyobjc-framework-Cocoa 9.0 pyobjc-framework-CoreServices 9.0 pyobjc-framework-FSEvents 9.0 pyodbc 4.0.34 pyOpenSSL 23.2.0 pyparsing 3.0.9 PyQt5-sip 12.11.0 pyrsistent 0.18.0 PySocks 1.7.1 pytest 7.4.0 python-dateutil 2.8.2 python-dotenv 0.21.0 python-json-logger 2.0.7 python-lsp-black 1.2.1 python-lsp-jsonrpc 1.0.0 python-lsp-server 1.7.2 python-slugify 5.0.2 python-snappy 0.6.1 pytoolconfig 1.2.5 pytz 2023.3.post1 pyviz-comms 2.3.0 PyWavelets 1.4.1 PyYAML 6.0 pyzmq 23.2.0 QDarkStyle 3.0.2 qstylizer 0.2.2 QtAwesome 1.2.2 qtconsole 5.4.2 QtPy 2.2.0 queuelib 1.5.0 regex 2022.7.9 requests 2.31.0 requests-file 1.5.1 requests-oauthlib 1.3.1 requests-toolbelt 1.0.0 resampy 0.4.2 responses 0.13.3 rfc3339-validator 0.1.4 rfc3986-validator 0.1.1 rope 1.7.0 rsa 4.9 Rtree 1.0.1 ruamel.yaml 0.17.21 ruamel-yaml-conda 0.17.21 s3fs 2023.4.0 safetensors 0.3.2 scikit-image 0.20.0 scikit-learn 1.3.0 scipy 1.11.1 Scrapy 2.8.0 seaborn 0.12.2 Send2Trash 1.8.0 service-identity 18.1.0 setuptools 68.0.0 sip 6.6.2 six 1.16.0 smart-open 5.2.1 sniffio 1.2.0 snowballstemmer 2.2.0 sortedcontainers 2.4.0 soundfile 0.12.1 soupsieve 2.4 soxr 0.3.7 Sphinx 5.0.2 sphinxcontrib-applehelp 1.0.2 sphinxcontrib-devhelp 1.0.2 sphinxcontrib-htmlhelp 2.0.0 sphinxcontrib-jsmath 1.0.1 sphinxcontrib-qthelp 1.0.3 
sphinxcontrib-serializinghtml 1.1.5 spyder 5.4.3 spyder-kernels 2.4.4 SQLAlchemy 1.4.39 stack-data 0.2.0 statsmodels 0.14.0 sympy 1.11.1 tables 3.8.0 tabulate 0.8.10 tblib 1.7.0 tenacity 8.2.2 tensorboard 2.14.1 tensorboard-data-server 0.7.1 tensorflow 2.14.0 tensorflow-estimator 2.14.0 tensorflow-io 0.34.0 tensorflow-io-gcs-filesystem 0.33.0 termcolor 2.3.0 terminado 0.17.1 text-unidecode 1.3 textdistance 4.2.1 threadpoolctl 2.2.0 three-merge 0.1.1 tifffile 2023.4.12 tinycss2 1.2.1 tldextract 3.2.0 tokenizers 0.13.2 toml 0.10.2 tomlkit 0.11.1 toolz 0.12.0 torch 2.2.2 torchdata 0.7.1 torchmetrics 1.4.0.post0 torchtext 0.17.2 torchvision 0.17.2 torchviz 0.0.2 tornado 6.3.2 tqdm 4.65.0 traitlets 5.7.1 transformers 4.32.1 Twisted 22.10.0 typing_extensions 4.10.0 tzdata 2023.3 uc-micro-py 1.0.1 ujson 5.4.0 Unidecode 1.2.0 urllib3 1.26.16 w3lib 1.21.0 watchdog 2.1.6 wcwidth 0.2.5 webencodings 0.5.1 websocket-client 0.58.0 Werkzeug 2.2.3 whatthepatch 1.0.2 wheel 0.38.4 widgetsnbextension 4.0.5 wrapt 1.14.1 wurlitzer 3.0.2 xarray 2023.6.0 xlwings 0.29.1 xxhash 2.0.2 xyzservices 2022.9.0 y-py 0.5.9 yapf 0.31.0 yarl 1.8.1 ypy-websocket 0.8.2 zict 2.2.0 zipp 3.11.0 zope.interface 5.4.0 zstandard 0.19.0 ```
The text was updated successfully, but these errors were encountered:
No branches or pull requests
What have I done?
How can I fix the code that produced the following error?
Error:
My `!pip list` output in Jupyter Notebook
The text was updated successfully, but these errors were encountered: