Metadata-Version: 2.4
Name: data_prep_toolkit_transforms
Version: 1.1.1.dev1
Summary: Data Preparation Toolkit Transforms using Ray
Author-email: Maroun Touma <touma@us.ibm.com>
License: Apache-2.0
Keywords: transforms,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
Requires-Python: <3.13,>=3.10
Description-Content-Type: text/markdown
Requires-Dist: data-prep-toolkit==0.2.5.dev1
Provides-Extra: dev
Requires-Dist: twine; extra == "dev"
Requires-Dist: pytest>=7.3.2; extra == "dev"
Requires-Dist: pytest-dotenv>=0.5.2; extra == "dev"
Requires-Dist: pytest-env>=1.0.0; extra == "dev"
Requires-Dist: pre-commit>=3.3.2; extra == "dev"
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
Requires-Dist: moto==5.0.5; extra == "dev"
Requires-Dist: markupsafe==2.0.1; extra == "dev"
Provides-Extra: ray
Requires-Dist: data-prep-toolkit[ray]>=0.2.5.dev1; extra == "ray"
Requires-Dist: networkx==3.3; extra == "ray"
Requires-Dist: colorlog==6.8.2; extra == "ray"
Requires-Dist: func-timeout==4.3.5; extra == "ray"
Requires-Dist: emerge-viz==2.0.0; extra == "ray"
Provides-Extra: all
Requires-Dist: data-prep-toolkit>=0.2.5.dev0; extra == "all"
Requires-Dist: scancode-toolkit==32.1.0; platform_system != "Darwin" and extra == "all"
Requires-Dist: timeout-timer==0.2.0; extra == "all"
Requires-Dist: data-prep-toolkit>=0.2.5.dev0; extra == "all"
Requires-Dist: bs4>=0.0.2; extra == "all"
Requires-Dist: transformers>=4.38.2; extra == "all"
Requires-Dist: pandas; extra == "all"
Requires-Dist: click; extra == "all"
Requires-Dist: httpx-sse; extra == "all"
Requires-Dist: ibm-generative-ai; extra == "all"
Requires-Dist: matplotlib; extra == "all"
Requires-Dist: matplotlib-inline; extra == "all"
Requires-Dist: networkx; extra == "all"
Requires-Dist: numpy; extra == "all"
Requires-Dist: plotly; extra == "all"
Requires-Dist: pyarrow; extra == "all"
Requires-Dist: streamlit; extra == "all"
Requires-Dist: tree-sitter==0.21.3; extra == "all"
Requires-Dist: tree-sitter-languages==1.10.2; extra == "all"
Requires-Dist: uuid; extra == "all"
Requires-Dist: sentence_transformers; extra == "all"
Requires-Dist: clamd==1.0.2; extra == "all"
Requires-Dist: networkx==3.3; extra == "all"
Requires-Dist: colorlog==6.8.2; extra == "all"
Requires-Dist: func-timeout==4.3.5; extra == "all"
Requires-Dist: pandas==2.2.2; extra == "all"
Requires-Dist: emerge-viz==2.0.0; extra == "all"
Requires-Dist: presidio-analyzer>=2.2.355; extra == "all"
Requires-Dist: presidio-anonymizer>=2.2.355; extra == "all"
Requires-Dist: flair>=0.14.0; extra == "all"
Requires-Dist: pandas; extra == "all"
Requires-Dist: fasttext-wheel; extra == "all"
Requires-Dist: langcodes>=3.3.0; extra == "all"
Requires-Dist: huggingface-hub<1.0.0,>=0.21.4; extra == "all"
Requires-Dist: numpy==1.26.4; extra == "all"
Requires-Dist: docling-core==2.21.2; extra == "all"
Requires-Dist: docling-ibm-models==3.4.1; extra == "all"
Requires-Dist: docling-parse==3.4.0; extra == "all"
Requires-Dist: docling==2.25.1; extra == "all"
Requires-Dist: filetype<2.0.0,>=1.2.0; extra == "all"
Requires-Dist: docling-core>=2.18.0; extra == "all"
Requires-Dist: pydantic>=2.0.0; extra == "all"
Requires-Dist: llama-index-core<0.12.0,>=0.11.22; extra == "all"
Requires-Dist: sentence-transformers>=3.0.1; extra == "all"
Requires-Dist: nltk>=3.9.1; extra == "all"
Requires-Dist: transformers>=4.38.2; extra == "all"
Requires-Dist: pandas; extra == "all"
Requires-Dist: requests; extra == "all"
Requires-Dist: polars>=1.9.0; extra == "all"
Requires-Dist: textstat; extra == "all"
Requires-Dist: pandas; extra == "all"
Requires-Dist: fasttext-wheel; extra == "all"
Requires-Dist: langcodes>=3.5.0; extra == "all"
Requires-Dist: huggingface-hub<1.0.0,>=0.21.4; extra == "all"
Requires-Dist: numpy<1.29.0,>=1.26.4; extra == "all"
Requires-Dist: mmh3==4.1.0; extra == "all"
Requires-Dist: xxhash==3.4.1; extra == "all"
Requires-Dist: duckdb>=0.10.1; extra == "all"
Requires-Dist: pandas>=2.2.0; extra == "all"
Requires-Dist: mmh3>=4.1.0; extra == "all"
Requires-Dist: xxhash==3.4.1; extra == "all"
Requires-Dist: pyyaml>=6.0.2; extra == "all"
Requires-Dist: boto3; extra == "all"
Requires-Dist: kubernetes>=30.1.0; extra == "all"
Requires-Dist: polars!=1.10.0,!=1.11.0,!=1.12.0,>=1.9.0; extra == "all"
Requires-Dist: disjoint-set>=0.8.0; extra == "all"
Requires-Dist: scipy<2.0.0,>=1.12.1; extra == "all"
Requires-Dist: numpy<1.29.0; extra == "all"
Requires-Dist: sentencepiece>=0.2.0; extra == "all"
Requires-Dist: mmh3>=4.1.0; extra == "all"
Requires-Dist: nltk==3.9.1; extra == "all"
Requires-Dist: transformers>=4.38.2; extra == "all"
Requires-Dist: torch<=2.5.1,>=2.2.2; extra == "all"
Requires-Dist: pandas; extra == "all"
Requires-Dist: rbloom>=1.5.2; extra == "all"
Requires-Dist: pandas>=2.2.2; extra == "all"
Requires-Dist: pyarrow>=16.1.0; extra == "all"
Requires-Dist: huggingface_hub>=0.25.2; extra == "all"
Requires-Dist: transformers>=4.38.2; extra == "all"
Requires-Dist: data_prep_connector>=0.2.3; extra == "all"
Requires-Dist: nltk>=3.9.1; extra == "all"
Requires-Dist: requests; extra == "all"
Requires-Dist: transformers; extra == "all"
Requires-Dist: pandas; extra == "all"
Requires-Dist: psutil; extra == "all"
Requires-Dist: GPUtil; extra == "all"
Requires-Dist: transformers>=4.38.2; extra == "all"
Requires-Dist: data-prep-toolkit-transforms[tokenization]; extra == "all"
Requires-Dist: torch; extra == "all"
Requires-Dist: python-dotenv; extra == "all"
Requires-Dist: pygtrie>=2.5.0; extra == "all"
Provides-Extra: language
Requires-Dist: presidio-analyzer>=2.2.355; extra == "language"
Requires-Dist: presidio-anonymizer>=2.2.355; extra == "language"
Requires-Dist: flair>=0.14.0; extra == "language"
Requires-Dist: pandas; extra == "language"
Requires-Dist: fasttext-wheel; extra == "language"
Requires-Dist: langcodes>=3.3.0; extra == "language"
Requires-Dist: huggingface-hub<1.0.0,>=0.21.4; extra == "language"
Requires-Dist: numpy==1.26.4; extra == "language"
Requires-Dist: docling-core==2.21.2; extra == "language"
Requires-Dist: docling-ibm-models==3.4.1; extra == "language"
Requires-Dist: docling-parse==3.4.0; extra == "language"
Requires-Dist: docling==2.25.1; extra == "language"
Requires-Dist: filetype<2.0.0,>=1.2.0; extra == "language"
Requires-Dist: docling-core>=2.18.0; extra == "language"
Requires-Dist: pydantic>=2.0.0; extra == "language"
Requires-Dist: llama-index-core<0.12.0,>=0.11.22; extra == "language"
Requires-Dist: sentence-transformers>=3.0.1; extra == "language"
Requires-Dist: nltk>=3.9.1; extra == "language"
Requires-Dist: transformers>=4.38.2; extra == "language"
Requires-Dist: pandas; extra == "language"
Requires-Dist: requests; extra == "language"
Requires-Dist: polars>=1.9.0; extra == "language"
Requires-Dist: textstat; extra == "language"
Requires-Dist: pandas; extra == "language"
Requires-Dist: fasttext-wheel; extra == "language"
Requires-Dist: langcodes>=3.5.0; extra == "language"
Requires-Dist: huggingface-hub<1.0.0,>=0.21.4; extra == "language"
Requires-Dist: numpy<1.29.0,>=1.26.4; extra == "language"
Requires-Dist: pyyaml; extra == "language"
Requires-Dist: ftfy; extra == "language"
Requires-Dist: unicategories; extra == "language"
Requires-Dist: unicodedataplus; extra == "language"
Requires-Dist: datatrove; extra == "language"
Requires-Dist: fasttext; extra == "language"
Requires-Dist: nltk; extra == "language"
Requires-Dist: spacy; extra == "language"
Requires-Dist: duckdb>=0.10.1; extra == "language"
Requires-Dist: pandas>=2.2.0; extra == "language"
Requires-Dist: mmh3>=4.1.0; extra == "language"
Requires-Dist: xxhash==3.4.1; extra == "language"
Requires-Dist: pyyaml>=6.0.2; extra == "language"
Requires-Dist: boto3; extra == "language"
Requires-Dist: kubernetes>=30.1.0; extra == "language"
Requires-Dist: polars!=1.10.0,!=1.11.0,!=1.12.0,>=1.9.0; extra == "language"
Requires-Dist: disjoint-set>=0.8.0; extra == "language"
Requires-Dist: scipy<2.0.0,>=1.12.1; extra == "language"
Requires-Dist: numpy<1.29.0; extra == "language"
Requires-Dist: sentencepiece>=0.2.0; extra == "language"
Requires-Dist: mmh3>=4.1.0; extra == "language"
Requires-Dist: nltk==3.9.1; extra == "language"
Requires-Dist: transformers>=4.38.2; extra == "language"
Requires-Dist: torch<=2.5.1,>=2.2.2; extra == "language"
Requires-Dist: pandas; extra == "language"
Requires-Dist: rbloom>=1.5.2; extra == "language"
Requires-Dist: pandas>=2.2.2; extra == "language"
Requires-Dist: pyarrow>=16.1.0; extra == "language"
Requires-Dist: huggingface_hub>=0.25.2; extra == "language"
Requires-Dist: transformers>=4.38.2; extra == "language"
Requires-Dist: data_prep_connector>=0.2.3; extra == "language"
Requires-Dist: mmh3==4.1.0; extra == "language"
Requires-Dist: xxhash==3.4.1; extra == "language"
Requires-Dist: nltk>=3.9.1; extra == "language"
Requires-Dist: requests; extra == "language"
Requires-Dist: transformers; extra == "language"
Requires-Dist: pandas; extra == "language"
Requires-Dist: psutil; extra == "language"
Requires-Dist: GPUtil; extra == "language"
Requires-Dist: transformers>=4.38.2; extra == "language"
Requires-Dist: data-prep-toolkit-transforms[tokenization]; extra == "language"
Requires-Dist: torch; extra == "language"
Requires-Dist: python-dotenv; extra == "language"
Requires-Dist: pygtrie>=2.5.0; extra == "language"
Provides-Extra: code
Requires-Dist: data-prep-toolkit>=0.2.5.dev0; extra == "code"
Requires-Dist: scancode-toolkit==32.1.0; platform_system != "Darwin" and extra == "code"
Requires-Dist: timeout-timer==0.2.0; extra == "code"
Requires-Dist: data-prep-toolkit>=0.2.5.dev0; extra == "code"
Requires-Dist: bs4>=0.0.2; extra == "code"
Requires-Dist: transformers>=4.38.2; extra == "code"
Requires-Dist: pandas; extra == "code"
Requires-Dist: click; extra == "code"
Requires-Dist: httpx-sse; extra == "code"
Requires-Dist: ibm-generative-ai; extra == "code"
Requires-Dist: matplotlib; extra == "code"
Requires-Dist: matplotlib-inline; extra == "code"
Requires-Dist: networkx; extra == "code"
Requires-Dist: numpy; extra == "code"
Requires-Dist: plotly; extra == "code"
Requires-Dist: pyarrow; extra == "code"
Requires-Dist: streamlit; extra == "code"
Requires-Dist: tree-sitter==0.21.3; extra == "code"
Requires-Dist: tree-sitter-languages==1.10.2; extra == "code"
Requires-Dist: uuid; extra == "code"
Requires-Dist: sentence_transformers; extra == "code"
Requires-Dist: clamd==1.0.2; extra == "code"
Requires-Dist: networkx==3.3; extra == "code"
Requires-Dist: colorlog==6.8.2; extra == "code"
Requires-Dist: func-timeout==4.3.5; extra == "code"
Requires-Dist: pandas==2.2.2; extra == "code"
Requires-Dist: emerge-viz==2.0.0; extra == "code"
Requires-Dist: duckdb>=0.10.1; extra == "code"
Requires-Dist: pandas>=2.2.0; extra == "code"
Requires-Dist: mmh3>=4.1.0; extra == "code"
Requires-Dist: xxhash==3.4.1; extra == "code"
Requires-Dist: pyyaml>=6.0.2; extra == "code"
Requires-Dist: boto3; extra == "code"
Requires-Dist: kubernetes>=30.1.0; extra == "code"
Requires-Dist: polars!=1.10.0,!=1.11.0,!=1.12.0,>=1.9.0; extra == "code"
Requires-Dist: disjoint-set>=0.8.0; extra == "code"
Requires-Dist: scipy<2.0.0,>=1.12.1; extra == "code"
Requires-Dist: numpy<1.29.0; extra == "code"
Requires-Dist: sentencepiece>=0.2.0; extra == "code"
Requires-Dist: mmh3>=4.1.0; extra == "code"
Requires-Dist: nltk==3.9.1; extra == "code"
Requires-Dist: transformers>=4.38.2; extra == "code"
Requires-Dist: torch<=2.5.1,>=2.2.2; extra == "code"
Requires-Dist: pandas; extra == "code"
Requires-Dist: rbloom>=1.5.2; extra == "code"
Requires-Dist: pandas>=2.2.2; extra == "code"
Requires-Dist: pyarrow>=16.1.0; extra == "code"
Requires-Dist: huggingface_hub>=0.25.2; extra == "code"
Requires-Dist: transformers>=4.38.2; extra == "code"
Requires-Dist: data_prep_connector>=0.2.3; extra == "code"
Requires-Dist: mmh3==4.1.0; extra == "code"
Requires-Dist: xxhash==3.4.1; extra == "code"
Requires-Dist: nltk>=3.9.1; extra == "code"
Requires-Dist: requests; extra == "code"
Requires-Dist: transformers; extra == "code"
Requires-Dist: pandas; extra == "code"
Requires-Dist: psutil; extra == "code"
Requires-Dist: GPUtil; extra == "code"
Requires-Dist: transformers>=4.38.2; extra == "code"
Requires-Dist: data-prep-toolkit-transforms[tokenization]; extra == "code"
Requires-Dist: torch; extra == "code"
Requires-Dist: python-dotenv; extra == "code"
Requires-Dist: pygtrie>=2.5.0; extra == "code"
Provides-Extra: gneissweb
Requires-Dist: polars>=1.9.0; extra == "gneissweb"
Requires-Dist: textstat; extra == "gneissweb"
Requires-Dist: pandas; extra == "gneissweb"
Requires-Dist: fasttext-wheel; extra == "gneissweb"
Requires-Dist: langcodes>=3.5.0; extra == "gneissweb"
Requires-Dist: huggingface-hub<1.0.0,>=0.21.4; extra == "gneissweb"
Requires-Dist: numpy<1.29.0,>=1.26.4; extra == "gneissweb"
Requires-Dist: duckdb>=0.10.1; extra == "gneissweb"
Requires-Dist: pandas>=2.2.0; extra == "gneissweb"
Requires-Dist: transformers>=4.38.2; extra == "gneissweb"
Requires-Dist: nltk>=3.9.1; extra == "gneissweb"
Requires-Dist: requests; extra == "gneissweb"
Requires-Dist: transformers; extra == "gneissweb"
Requires-Dist: pandas; extra == "gneissweb"
Requires-Dist: psutil; extra == "gneissweb"
Requires-Dist: GPUtil; extra == "gneissweb"
Requires-Dist: rbloom>=1.5.2; extra == "gneissweb"
Requires-Dist: pandas>=2.2.2; extra == "gneissweb"
Requires-Dist: pyarrow>=16.1.0; extra == "gneissweb"
Requires-Dist: huggingface_hub>=0.25.2; extra == "gneissweb"
Requires-Dist: transformers>=4.38.2; extra == "gneissweb"
Requires-Dist: data-prep-toolkit-transforms[tokenization]; extra == "gneissweb"
Requires-Dist: torch; extra == "gneissweb"
Requires-Dist: python-dotenv; extra == "gneissweb"
Provides-Extra: proglang-select
Provides-Extra: header-cleanser
Requires-Dist: data-prep-toolkit>=0.2.5.dev0; extra == "header-cleanser"
Requires-Dist: scancode-toolkit==32.1.0; platform_system != "Darwin" and extra == "header-cleanser"
Requires-Dist: timeout-timer==0.2.0; extra == "header-cleanser"
Provides-Extra: license-select
Provides-Extra: code-quality
Requires-Dist: data-prep-toolkit>=0.2.5.dev0; extra == "code-quality"
Requires-Dist: bs4>=0.0.2; extra == "code-quality"
Requires-Dist: transformers>=4.38.2; extra == "code-quality"
Provides-Extra: code2parquet
Provides-Extra: malware
Provides-Extra: code-profiler
Requires-Dist: pandas; extra == "code-profiler"
Requires-Dist: click; extra == "code-profiler"
Requires-Dist: httpx-sse; extra == "code-profiler"
Requires-Dist: ibm-generative-ai; extra == "code-profiler"
Requires-Dist: matplotlib; extra == "code-profiler"
Requires-Dist: matplotlib-inline; extra == "code-profiler"
Requires-Dist: networkx; extra == "code-profiler"
Requires-Dist: numpy; extra == "code-profiler"
Requires-Dist: plotly; extra == "code-profiler"
Requires-Dist: pyarrow; extra == "code-profiler"
Requires-Dist: streamlit; extra == "code-profiler"
Requires-Dist: tree-sitter==0.21.3; extra == "code-profiler"
Requires-Dist: tree-sitter-languages==1.10.2; extra == "code-profiler"
Requires-Dist: uuid; extra == "code-profiler"
Requires-Dist: sentence_transformers; extra == "code-profiler"
Provides-Extra: repo-level-order
Requires-Dist: networkx==3.3; extra == "repo-level-order"
Requires-Dist: colorlog==6.8.2; extra == "repo-level-order"
Requires-Dist: func-timeout==4.3.5; extra == "repo-level-order"
Requires-Dist: pandas==2.2.2; extra == "repo-level-order"
Requires-Dist: emerge-viz==2.0.0; extra == "repo-level-order"
Provides-Extra: profiler
Requires-Dist: mmh3==4.1.0; extra == "profiler"
Requires-Dist: xxhash==3.4.1; extra == "profiler"
Provides-Extra: resize
Provides-Extra: doc-chunk
Requires-Dist: docling-core>=2.18.0; extra == "doc-chunk"
Requires-Dist: pydantic>=2.0.0; extra == "doc-chunk"
Requires-Dist: llama-index-core<0.12.0,>=0.11.22; extra == "doc-chunk"
Provides-Extra: doc-quality
Provides-Extra: html2parquet
Requires-Dist: trafilatura==1.12.0; extra == "html2parquet"
Provides-Extra: lang-id
Requires-Dist: fasttext-wheel; extra == "lang-id"
Requires-Dist: langcodes>=3.3.0; extra == "lang-id"
Requires-Dist: huggingface-hub<1.0.0,>=0.21.4; extra == "lang-id"
Requires-Dist: numpy==1.26.4; extra == "lang-id"
Provides-Extra: docling2parquet
Requires-Dist: docling-core==2.21.2; extra == "docling2parquet"
Requires-Dist: docling-ibm-models==3.4.1; extra == "docling2parquet"
Requires-Dist: docling-parse==3.4.0; extra == "docling2parquet"
Requires-Dist: docling==2.25.1; extra == "docling2parquet"
Requires-Dist: filetype<2.0.0,>=1.2.0; extra == "docling2parquet"
Provides-Extra: text-encoder
Requires-Dist: sentence-transformers>=3.0.1; extra == "text-encoder"
Provides-Extra: pii-redactor
Requires-Dist: presidio-analyzer>=2.2.355; extra == "pii-redactor"
Requires-Dist: presidio-anonymizer>=2.2.355; extra == "pii-redactor"
Requires-Dist: flair>=0.14.0; extra == "pii-redactor"
Requires-Dist: pandas; extra == "pii-redactor"
Provides-Extra: similarity
Requires-Dist: nltk>=3.9.1; extra == "similarity"
Requires-Dist: transformers>=4.38.2; extra == "similarity"
Requires-Dist: pandas; extra == "similarity"
Requires-Dist: requests; extra == "similarity"
Provides-Extra: extreme-tokenized
Requires-Dist: polars>=1.9.0; extra == "extreme-tokenized"
Provides-Extra: readability
Requires-Dist: textstat; extra == "readability"
Requires-Dist: pandas; extra == "readability"
Provides-Extra: gneissweb-classification
Requires-Dist: fasttext-wheel; extra == "gneissweb-classification"
Requires-Dist: langcodes>=3.5.0; extra == "gneissweb-classification"
Requires-Dist: huggingface-hub<1.0.0,>=0.21.4; extra == "gneissweb-classification"
Requires-Dist: numpy<1.29.0,>=1.26.4; extra == "gneissweb-classification"
Provides-Extra: ml-filter
Requires-Dist: pyyaml; extra == "ml-filter"
Provides-Extra: enrichment
Requires-Dist: ftfy; extra == "enrichment"
Requires-Dist: unicategories; extra == "enrichment"
Requires-Dist: unicodedataplus; extra == "enrichment"
Requires-Dist: datatrove; extra == "enrichment"
Requires-Dist: fasttext; extra == "enrichment"
Requires-Dist: nltk; extra == "enrichment"
Requires-Dist: spacy; extra == "enrichment"
Provides-Extra: filter
Requires-Dist: duckdb>=0.10.1; extra == "filter"
Requires-Dist: pandas>=2.2.0; extra == "filter"
Provides-Extra: doc-id
Provides-Extra: hap
Requires-Dist: nltk==3.9.1; extra == "hap"
Requires-Dist: transformers>=4.38.2; extra == "hap"
Requires-Dist: torch<=2.5.1,>=2.2.2; extra == "hap"
Requires-Dist: pandas; extra == "hap"
Provides-Extra: bloom
Requires-Dist: rbloom>=1.5.2; extra == "bloom"
Requires-Dist: pandas>=2.2.2; extra == "bloom"
Requires-Dist: pyarrow>=16.1.0; extra == "bloom"
Requires-Dist: huggingface_hub>=0.25.2; extra == "bloom"
Provides-Extra: ededup
Requires-Dist: mmh3>=4.1.0; extra == "ededup"
Requires-Dist: xxhash==3.4.1; extra == "ededup"
Provides-Extra: fdedup
Requires-Dist: pyyaml>=6.0.2; extra == "fdedup"
Requires-Dist: boto3; extra == "fdedup"
Requires-Dist: kubernetes>=30.1.0; extra == "fdedup"
Requires-Dist: polars!=1.10.0,!=1.11.0,!=1.12.0,>=1.9.0; extra == "fdedup"
Requires-Dist: disjoint-set>=0.8.0; extra == "fdedup"
Requires-Dist: scipy<2.0.0,>=1.12.1; extra == "fdedup"
Requires-Dist: numpy<1.29.0; extra == "fdedup"
Requires-Dist: sentencepiece>=0.2.0; extra == "fdedup"
Requires-Dist: mmh3>=4.1.0; extra == "fdedup"
Provides-Extra: tokenization
Requires-Dist: transformers>=4.38.2; extra == "tokenization"
Provides-Extra: web2parquet
Requires-Dist: data_prep_connector>=0.2.3; extra == "web2parquet"
Provides-Extra: rep-removal
Requires-Dist: nltk>=3.9.1; extra == "rep-removal"
Requires-Dist: requests; extra == "rep-removal"
Requires-Dist: transformers; extra == "rep-removal"
Requires-Dist: pandas; extra == "rep-removal"
Requires-Dist: psutil; extra == "rep-removal"
Requires-Dist: GPUtil; extra == "rep-removal"
Provides-Extra: tokenization2arrow
Requires-Dist: transformers>=4.38.2; extra == "tokenization2arrow"
Requires-Dist: data-prep-toolkit-transforms[tokenization]; extra == "tokenization2arrow"
Requires-Dist: torch; extra == "tokenization2arrow"
Requires-Dist: python-dotenv; extra == "tokenization2arrow"
Provides-Extra: collapse
Provides-Extra: blocklist
Requires-Dist: pygtrie>=2.5.0; extra == "blocklist"

# DPK Python Transforms

## Installation

The [transforms](https://github.com/data-prep-kit/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard Python library available on PyPI and can be installed using `pip install`:

`python -m pip install data-prep-toolkit-transforms[all]`
or
`python -m pip install data-prep-toolkit-transforms[ray,all]`
or
`python -m pip install data-prep-toolkit-transforms[language]`


Installing the Python transforms will also install `data-prep-toolkit`.

Installing the Ray transforms will also install `data-prep-toolkit[ray]`.

## Release notes:

### 1.1.1.dev1
	Include all code transforms as extra [code]

### 1.1.1.dev0
	Refactored code transforms (code_quality, code2parquet, header_cleanser, license_select, proglang_select)
	Added ml-filter and enrichment
	Renamed PDF2Parquet to Docling2Parquet

### 1.0.1.dev1
	Added Gneissweb transforms
	fdedup fix for windows
### 1.0.1.dev0
	PR #979 (code_profiler)
### 1.0.0.a6
	Added Profiler
	Added Resize
### 1.0.0.a5
	Added Pii Redactor
	Relax fasttext requirement >= 0.9.2
### 1.0.0.a4
	Added missing ray implementation for lang_id, doc_quality, tokenization and filter
	Added ray notebooks for lang id, Doc Quality, tokenization, and Filter
### 1.0.0.a3
	Added code_profiler
### 1.0.0.a2
   Relax dependencies on pandas (use latest or whatever is installed by application)
   Relax dependencies on requests (use latest or whatever is installed by application)



 
