Skip to content

Commit

Permalink
Added script for cleaning data.
Browse files Browse the repository at this point in the history
  • Loading branch information
souradipp76 committed May 20, 2024
1 parent 76013a2 commit 9dad00a
Showing 1 changed file with 63 additions and 18 deletions.
81 changes: 63 additions & 18 deletions scripts/data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -305,24 +305,6 @@
"#github_urls = ['https://github.com/TencentARC/GFPGAN', 'https://github.com/apache/airflow', 'https://github.com/chenfei-wu/TaskMatrix', 'https://github.com/mitmproxy/mitmproxy', 'https://github.com/lm-sys/FastChat', 'https://github.com/comfyanonymous/ComfyUI', 'https://github.com/babysor/MockingBird', 'https://github.com/openai/gym', 'https://github.com/testerSunshine/12306', 'https://github.com/shadowsocks/shadowsocks', 'https://github.com/microsoft/DeepSpeed', 'https://github.com/XX-net/XX-Net', 'https://github.com/fxsjy/jieba', 'https://github.com/hankcs/HanLP', 'https://github.com/Asabeneh/30-Days-Of-Python', 'https://github.com/karpathy/nanoGPT', 'https://github.com/httpie/cli', 'https://github.com/streamlit/streamlit', 'https://github.com/ccxt/ccxt', 'https://github.com/run-llama/llama_index', 'https://github.com/ray-project/ray', 'https://github.com/certbot/certbot', 'https://github.com/sqlmapproject/sqlmap', 'https://github.com/geekcomputers/Python', 'https://github.com/huggingface/pytorch-image-models', 'https://github.com/coqui-ai/TTS', 'https://github.com/python-poetry/poetry', 'https://github.com/0xAX/linux-insides', 'https://github.com/facebookresearch/fairseq', 'https://github.com/gradio-app/gradio', 'https://github.com/yunjey/pytorch-tutorial', 'https://github.com/tatsu-lab/stanford_alpaca', 'https://github.com/explosion/spaCy', 'https://github.com/donnemartin/interactive-coding-challenges', 'https://github.com/facebookresearch/detectron2', 'https://github.com/Pythagora-io/gpt-pilot', 'https://github.com/google/jax', 'https://github.com/lllyasviel/ControlNet', 'https://github.com/acheong08/ChatGPT', 'https://github.com/open-mmlab/mmdetection', 'https://github.com/chatchat-space/Langchain-Chatchat', 'https://github.com/encode/django-rest-framework', 'https://github.com/tqdm/tqdm', 'https://github.com/Lightning-AI/pytorch-lightning', 'https://github.com/LC044/WeChatMsg', 'https://github.com/OWASP/CheatSheetSeries', 
'https://github.com/donnemartin/data-science-ipython-notebooks', 'https://github.com/numpy/numpy', 'https://github.com/google/python-fire', 'https://github.com/xinntao/Real-ESRGAN', 'https://github.com/OpenBB-finance/OpenBBTerminal', 'https://github.com/facebookresearch/Detectron', 'https://github.com/freqtrade/freqtrade', 'https://github.com/StevenBlack/hosts', 'https://github.com/ycm-core/YouCompleteMe', 'https://github.com/spipm/Depix', 'https://github.com/zhayujie/chatgpt-on-wechat', 'https://github.com/littlecodersh/ItChat', 'https://github.com/nicolargo/glances', 'https://github.com/s0md3v/roop', 'https://github.com/getredash/redash', 'https://github.com/deezer/spleeter', 'https://github.com/Vision-CAIR/MiniGPT-4', 'https://github.com/python-telegram-bot/python-telegram-bot', 'https://github.com/pypa/pipenv', 'https://github.com/myshell-ai/OpenVoice', 'https://github.com/OpenDevin/OpenDevin', 'https://github.com/microsoft/cascadia-code', 'https://github.com/matterport/Mask_RCNN', 'https://github.com/tinygrad/tinygrad', 'https://github.com/svc-develop-team/so-vits-svc', 'https://github.com/RVC-Boss/GPT-SoVITS', 'https://github.com/jumpserver/jumpserver', 'https://github.com/locustio/locust', 'https://github.com/chubin/wttr.in', 'https://github.com/Textualize/textual', 'https://github.com/celery/celery', 'https://github.com/keon/algorithms', 'https://github.com/vnpy/vnpy', 'https://github.com/iperov/DeepFaceLive', 'https://github.com/ultralytics/ultralytics', 'https://github.com/eriklindernoren/ML-From-Scratch', 'https://github.com/microsoft/JARVIS', 'https://github.com/huggingface/diffusers', 'https://github.com/wangzheng0822/algo', 'https://github.com/mouredev/Hello-Python', 'https://github.com/Stability-AI/generative-models', 'https://github.com/sebastianruder/NLP-progress', 'https://github.com/JaidedAI/EasyOCR', 'https://github.com/kovidgoyal/kitty', 'https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix', 'https://github.com/HumanSignal/labelImg', 
'https://github.com/d2l-ai/d2l-en', 'https://github.com/AtsushiSakai/PythonRobotics', 'https://github.com/pytorch/examples', 'https://github.com/cookiecutter/cookiecutter', 'https://github.com/tornadoweb/tornado', 'https://github.com/hiyouga/LLaMA-Factory', 'https://github.com/mindsdb/mindsdb', 'https://github.com/deepinsight/insightface', 'https://github.com/openai/gpt-2', 'https://github.com/luong-komorebi/Awesome-Linux-Software', 'https://github.com/WZMIAOMIAO/deep-learning-for-image-processing', 'https://github.com/drduh/macOS-Security-and-Privacy-Guide', 'https://github.com/openai/chatgpt-retrieval-plugin', 'https://github.com/plotly/dash', 'https://github.com/chriskiehl/Gooey', 'https://github.com/jhao104/proxy_pool', 'https://github.com/pyg-team/pytorch_geometric', 'https://github.com/saleor/saleor', 'https://github.com/zulip/zulip', 'https://github.com/jina-ai/jina', 'https://github.com/openai/openai-python', 'https://github.com/KurtBestor/Hitomi-Downloader', 'https://github.com/521xueweihan/GitHub520', 'https://github.com/ArchiveBox/ArchiveBox', 'https://github.com/facebookresearch/audiocraft', 'https://github.com/meta-llama/llama3', 'https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI', 'https://github.com/matplotlib/matplotlib', 'https://github.com/yoheinakajima/babyagi', 'https://github.com/Vonng/ddia', 'https://github.com/PromtEngineer/localGPT', 'https://github.com/vllm-project/vllm', 'https://github.com/ManimCommunity/manim', 'https://github.com/ungoogled-software/ungoogled-chromium', 'https://github.com/karpathy/minGPT', 'https://github.com/magenta/magenta', 'https://github.com/bokeh/bokeh', 'https://github.com/pydantic/pydantic', 'https://github.com/huggingface/datasets', 'https://github.com/microsoft/unilm', 'https://github.com/kholia/OSX-KVM', 'https://github.com/kovidgoyal/calibre', 'https://github.com/mkdocs/mkdocs', 'https://github.com/magic-wormhole/magic-wormhole', 'https://github.com/Delgan/loguru', 
'https://github.com/lucidrains/vit-pytorch', 'https://github.com/nginx-proxy/nginx-proxy', 'https://github.com/recommenders-team/recommenders', 'https://github.com/RasaHQ/rasa', 'https://github.com/facebook/prophet', 'https://github.com/sanic-org/sanic', 'https://github.com/kaixindelele/ChatPaper', 'https://github.com/Jack-Cherish/python-spider', 'https://github.com/jantic/DeOldify', 'https://github.com/python/mypy', 'https://github.com/ymcui/Chinese-LLaMA-Alpaca', 'https://github.com/pyscript/pyscript', 'https://github.com/PostHog/posthog', 'https://github.com/mlflow/mlflow', 'https://github.com/spotify/luigi', 'https://github.com/wagtail/wagtail', 'https://github.com/Sanster/IOPaint', 'https://github.com/miloyip/game-programmer', 'https://github.com/joke2k/faker', 'https://github.com/mlc-ai/mlc-llm', 'https://github.com/Ciphey/Ciphey', 'https://github.com/quantopian/zipline', 'https://github.com/paperless-ngx/paperless-ngx', 'https://github.com/frappe/erpnext', 'https://github.com/stitionai/devika', 'https://github.com/rsms/inter', 'https://github.com/kivy/kivy', 'https://github.com/reflex-dev/reflex', 'https://github.com/onnx/onnx', 'https://github.com/reddit-archive/reddit', 'https://github.com/hpcaitech/Open-Sora', 'https://github.com/haotian-liu/LLaVA', 'https://github.com/chatanywhere/GPT_API_free', 'https://github.com/InstaPy/InstaPy', 'https://github.com/binux/pyspider', 'https://github.com/LiLittleCat/awesome-free-chatgpt', 'https://github.com/cool-RR/PySnooper', 'https://github.com/apple/ml-stable-diffusion', 'https://github.com/ipython/ipython', 'https://github.com/wilsonfreitas/awesome-quant', 'https://github.com/alievk/avatarify-python', 'https://github.com/Mikubill/sd-webui-controlnet', 'https://github.com/wting/autojump', 'https://github.com/trekhleb/learn-python', 'https://github.com/eriklindernoren/PyTorch-GAN', 'https://github.com/Kr1s77/awesome-python-login-model', 'https://github.com/twintproject/twint', 'https://github.com/THUDM/ChatGLM2-6B', 
'https://github.com/wistbean/learn_python3_spider', 'https://github.com/mnielsen/neural-networks-and-deep-learning', 'https://github.com/pytorch/vision', 'https://github.com/h2y/Shadowrocket-ADBlock-Rules', 'https://github.com/OpenEthan/SMSBoom', 'https://github.com/openai/baselines', 'https://github.com/plotly/plotly.py', 'https://github.com/piskvorky/gensim', 'https://github.com/RunaCapital/awesome-oss-alternatives', 'https://github.com/meta-llama/codellama', 'https://github.com/pallets/click', 'https://github.com/spotDL/spotify-downloader', 'https://github.com/dgtlmoon/changedetection.io', 'https://github.com/Anjok07/ultimatevocalremovergui', 'https://github.com/netbox-community/netbox', 'https://github.com/mxrch/GHunt', 'https://github.com/ranger/ranger', 'https://github.com/tensorflow/tensor2tensor', 'https://github.com/aws/aws-cli', 'https://github.com/blakeblackshear/frigate', 'https://github.com/w-okada/voice-changer', 'https://github.com/GaiZhenbiao/ChuanhuChatGPT', 'https://github.com/PrefectHQ/prefect', 'https://github.com/jupyter/jupyter', 'https://github.com/facefusion/facefusion', 'https://github.com/danielgatis/rembg', 'https://github.com/borisdayma/dalle-mini', 'https://github.com/fabric/fabric', 'https://github.com/aio-libs/aiohttp', 'https://github.com/ddbourgin/numpy-ml', 'https://github.com/TransformerOptimus/SuperAGI', 'https://github.com/microsoft/Bringing-Old-Photos-Back-to-Life', 'https://github.com/pyecharts/pyecharts', 'https://github.com/tiangolo/typer', 'https://github.com/Rapptz/discord.py', 'https://github.com/fauxpilot/fauxpilot', 'https://github.com/lra/mackup', 'https://github.com/apprenticeharper/DeDRM_tools', 'https://github.com/microsoft/qlib', 'https://github.com/networkx/networkx', 'https://github.com/powerline/powerline', 'https://github.com/arc53/DocsGPT', 'https://github.com/Python-World/python-mini-projects', 'https://github.com/airbytehq/airbyte', 'https://github.com/aleju/imgaug', 
'https://github.com/roboflow/supervision', 'https://github.com/pjialin/py12306', 'https://github.com/hindupuravinash/the-gan-zoo', 'https://github.com/unifyai/ivy', 'https://github.com/openai/evals', 'https://github.com/horovod/horovod', 'https://github.com/huggingface/peft', 'https://github.com/NVlabs/stylegan', 'https://github.com/tgbot-collection/YYeTsBot', 'https://github.com/gunthercox/ChatterBot', 'https://github.com/UKPLab/sentence-transformers', 'https://github.com/saltstack/salt', 'https://github.com/wangshub/wechat_jump_game', 'https://github.com/youfou/wxpy', 'https://github.com/microsoft/nni', 'https://github.com/deepset-ai/haystack', 'https://github.com/codelucas/newspaper', 'https://github.com/joaomdmoura/crewAI', 'https://github.com/google/yapf', 'https://github.com/psf/requests-html', 'https://github.com/flairNLP/flair', 'https://github.com/sczhou/CodeFormer', 'https://github.com/shengqiangzhang/examples-of-web-crawlers', 'https://github.com/davidsandberg/facenet', 'https://github.com/NanmiCoder/MediaCrawler', 'https://github.com/ansible/awx', 'https://github.com/albumentations-team/albumentations', 'https://github.com/programthink/zhao', 'https://github.com/mail-in-a-box/mailinabox', 'https://github.com/sivel/speedtest-cli', 'https://github.com/searx/searx', 'https://github.com/ShangtongZhang/reinforcement-learning-an-introduction', 'https://github.com/iterative/dvc', 'https://github.com/PySimpleGUI/PySimpleGUI', 'https://github.com/mementum/backtrader', 'https://github.com/tiangolo/sqlmodel', 'https://github.com/nltk/nltk', 'https://github.com/dmlc/dgl', 'https://github.com/microsoft/Swin-Transformer', 'https://github.com/jindongwang/transferlearning', 'https://github.com/facebookresearch/detr', 'https://github.com/idank/explainshell', 'https://github.com/s0md3v/XSStrike', 'https://github.com/fortra/impacket', 'https://github.com/MetaCubeX/mihomo', 'https://github.com/wifiphisher/wifiphisher', 'https://github.com/jaakkopasanen/AutoEq', 
'https://github.com/waditu/tushare', 'https://github.com/edgedb/edgedb', 'https://github.com/bloomberg/memray', 'https://github.com/ethereum/EIPs', 'https://github.com/PaddlePaddle/PaddleHub', 'https://github.com/scipy/scipy', 'https://github.com/chroma-core/chroma', 'https://github.com/sympy/sympy', 'https://github.com/beetbox/beets', 'https://github.com/postmanlabs/httpbin', 'https://github.com/labelmeai/labelme', 'https://github.com/SFTtech/openage', 'https://github.com/encode/httpx', 'https://github.com/redis/redis-py', 'https://github.com/getpelican/pelican', 'https://github.com/THUDM/ChatGLM3', 'https://github.com/jina-ai/clip-as-service', 'https://github.com/donnemartin/awesome-aws', 'https://github.com/microsoft/pyright', 'https://github.com/pre-commit/pre-commit', 'https://github.com/PaddlePaddle/PaddleDetection', 'https://github.com/ocrmypdf/OCRmyPDF', 'https://github.com/lss233/chatgpt-mirai-qq-bot', 'https://github.com/ydataai/ydata-profiling', 'https://github.com/dask/dask', 'https://github.com/mwaskom/seaborn', 'https://github.com/ranaroussi/yfinance', 'https://github.com/tonybeltramelli/pix2code', 'https://github.com/threat9/routersploit', 'https://github.com/Miserlou/Zappa', 'https://github.com/alexjc/neural-enhance', 'https://github.com/Zulko/moviepy', 'https://github.com/meolu/walle-web', 'https://github.com/OpenMOSS/MOSS', 'https://github.com/smicallef/spiderfoot', 'https://github.com/matrix-org/synapse', 'https://github.com/google-deepmind/alphafold', 'https://github.com/dbcli/pgcli', 'https://github.com/python-pillow/Pillow', 'https://github.com/BlinkDL/RWKV-LM', 'https://github.com/allenai/allennlp', 'https://github.com/LlamaFamily/Llama-Chinese', 'https://github.com/smol-ai/developer', 'https://github.com/janeczku/calibre-web', 'https://github.com/Embedding/Chinese-Word-Vectors', 'https://github.com/cookiecutter/cookiecutter-django', 'https://github.com/rougier/numpy-100', 'https://github.com/zalandoresearch/fashion-mnist']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a2b9af3",
"metadata": {},
"outputs": [],
"source": [
"def remove_urls(text):\n",
"    \"\"\"Return ``text`` with every match of the URL regex deleted.\n",
"\n",
"    A match starts at ``http://`` or ``https://`` and extends over the\n",
"    characters the pattern allows.\n",
"    \"\"\"\n",
"    # NOTE(review): `[$-_@.&+]` contains the range `$`..`_`, so it also admits\n",
"    # `/`, `:`, `?`, `=`, `<`, `>` and other characters -- confirm this is intended.\n",
"    url_matcher = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\\\(\\\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')\n",
"    return url_matcher.sub('', text)\n",
"\n",
"def remove_html_tags(text):\n",
"    \"\"\"Return ``text`` with every ``<...>`` span deleted.\n",
"\n",
"    Each match runs from a ``<`` to the nearest following ``>`` on the same\n",
"    line; ``.`` does not match a newline here, so a tag split across lines is\n",
"    left untouched.\n",
"    \"\"\"\n",
"    tag_matcher = re.compile(r'<.*?>')\n",
"    return tag_matcher.sub('', text)"
]
},
{
"cell_type": "code",
"execution_count": 6,
Expand Down Expand Up @@ -5078,6 +5060,69 @@
" df3 = pd.DataFrame(new_rows, index=None)\n",
" df3.to_csv(os.path.join(root_dir, \"readme_qa.csv\"), mode=\"a\")"
]
},
{
"cell_type": "markdown",
"id": "fd7595e4",
"metadata": {},
"source": [
"# Clean Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "62492c39",
"metadata": {},
"outputs": [],
"source": [
"def remove_urls(text):\n",
"    \"\"\"Return ``text`` with every match of the URL regex deleted.\n",
"\n",
"    A match starts at ``http://`` or ``https://`` and extends over the\n",
"    characters the pattern allows.\n",
"    \"\"\"\n",
"    # NOTE(review): `[$-_@.&+]` contains the range `$`..`_`, so it also admits\n",
"    # `/`, `:`, `?`, `=`, `<`, `>` and other characters -- confirm this is intended.\n",
"    url_matcher = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\\\(\\\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')\n",
"    return url_matcher.sub('', text)\n",
"\n",
"def remove_html_tags(text):\n",
"    \"\"\"Return ``text`` with every ``<...>`` span deleted.\n",
"\n",
"    Each match runs from a ``<`` to the nearest following ``>`` on the same\n",
"    line; ``.`` does not match a newline here, so a tag split across lines is\n",
"    left untouched.\n",
"    \"\"\"\n",
"    tag_matcher = re.compile(r'<.*?>')\n",
"    return tag_matcher.sub('', text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "21ef45fa",
"metadata": {},
"outputs": [],
"source": [
"# Compiled once when the cell runs rather than on every call; matching ignores\n",
"# letter case so that `<IMG ...>` and `HTTP://...` are handled as well.\n",
"_IMG_TAG_PATTERN = re.compile(r'<img[^>]*>', re.IGNORECASE)\n",
"_URL_PATTERN = re.compile(r'https?://\\S+', re.IGNORECASE)\n",
"\n",
"\n",
"def clean_text(text):\n",
"    \"\"\"Strip ``<img>`` tags and web links from ``text``.\n",
"\n",
"    Args:\n",
"        text: Value to clean. It is converted with ``str()`` first, so\n",
"            non-string values are accepted.\n",
"\n",
"    Returns:\n",
"        str: The text without ``<img ...>`` tags and without any\n",
"        ``http://`` / ``https://`` run of non-whitespace characters.\n",
"    \"\"\"\n",
"    # Tags go first: a URL match runs to the next whitespace, so stripping URLs\n",
"    # first would swallow the `\">` that closes `<img src=\"https://...\">` and\n",
"    # leave a broken tag behind.\n",
"    without_images = _IMG_TAG_PATTERN.sub('', str(text))\n",
"    return _URL_PATTERN.sub('', without_images)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1eeda830",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Read the QA pairs, discard rows with no answer, scrub each remaining answer\n",
"# with clean_text, and save the result without the index column.\n",
"# NOTE(review): an earlier cell writes readme_qa.csv under root_dir, while this\n",
"# one reads it from the working directory -- confirm both are the same place.\n",
"df = (\n",
"    pd.read_csv(\"readme_qa.csv\")\n",
"    .dropna(subset=[\"Answer\"])\n",
"    .assign(Answer=lambda frame: frame[\"Answer\"].apply(clean_text))\n",
")\n",
"df.to_csv(\"readme_qa_cleaned.csv\", index=False)"
]
}
],
"metadata": {
Expand Down

0 comments on commit 9dad00a

Please sign in to comment.