From 01a5ed40529685fdc64b360b07eadf0ae23c0794 Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Fri, 1 Dec 2023 21:46:40 -0300 Subject: [PATCH 01/23] Update Dockerfile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit adicionando a env var TMPDIR para ver se ganho mais espaço com o podman no codespaces Signed-off-by: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> --- scripts/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/Dockerfile b/scripts/Dockerfile index 139d337..79aaf7b 100644 --- a/scripts/Dockerfile +++ b/scripts/Dockerfile @@ -1,5 +1,6 @@ FROM docker.io/python:3.8 +ENV TMPDIR /tmp ENV USER gazette ENV USER_HOME /home/$USER ENV WORKDIR /mnt/code From c01aa33c7a5713e5deb17427cbfaf7e4ea9e99ae Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Sat, 2 Dec 2023 01:09:50 +0000 Subject: [PATCH 02/23] Revert "Update Dockerfile" This reverts commit 01a5ed40529685fdc64b360b07eadf0ae23c0794. --- scripts/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/Dockerfile b/scripts/Dockerfile index 79aaf7b..139d337 100644 --- a/scripts/Dockerfile +++ b/scripts/Dockerfile @@ -1,6 +1,5 @@ FROM docker.io/python:3.8 -ENV TMPDIR /tmp ENV USER gazette ENV USER_HOME /home/$USER ENV WORKDIR /mnt/code From a0220c04a423f35cadaceb2bccf768cc82888485 Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Sat, 2 Dec 2023 01:16:13 +0000 Subject: [PATCH 03/23] change podman to docker in build command --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 34e510d..8cfb7fd 100644 --- a/Makefile +++ b/Makefile @@ -56,12 +56,12 @@ black: .PHONY: build-devel build-devel: - podman build --tag $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) \ + docker build --tag $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) \ -f scripts/Dockerfile $(PWD) .PHONY: build-tika-server build-tika-server: - podman build --tag $(IMAGE_NAMESPACE)/$(APACHE_TIKA_IMAGE_NAME):$(APACHE_TIKA_IMAGE_TAG) \ + docket build --tag $(IMAGE_NAMESPACE)/$(APACHE_TIKA_IMAGE_NAME):$(APACHE_TIKA_IMAGE_TAG) \ -f scripts/Dockerfile_apache_tika $(PWD) .PHONY: build From e7b41bc6de6ec168ba50f9e3e254bcc4c1ae1484 Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Sat, 2 Dec 2023 01:44:19 +0000 Subject: [PATCH 04/23] fix typo --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8cfb7fd..3be5f02 100644 --- a/Makefile +++ b/Makefile @@ -61,7 +61,7 @@ build-devel: .PHONY: build-tika-server build-tika-server: - docket build --tag $(IMAGE_NAMESPACE)/$(APACHE_TIKA_IMAGE_NAME):$(APACHE_TIKA_IMAGE_TAG) \ + docker build --tag $(IMAGE_NAMESPACE)/$(APACHE_TIKA_IMAGE_NAME):$(APACHE_TIKA_IMAGE_TAG) \ -f scripts/Dockerfile_apache_tika $(PWD) .PHONY: build From 934793c42cd04591d853636e21f146e0ad32be4d Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Sat, 2 Dec 2023 01:53:33 +0000 Subject: [PATCH 05/23] changing podman to docker in makefile for setup --- Makefile | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 3be5f02..4cddbfa 100644 --- a/Makefile +++ b/Makefile @@ -82,10 +82,10 @@ destroy: podman rmi --force $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) destroy-pod: - podman pod rm --force --ignore $(POD_NAME) + docker pod rm --force --ignore $(POD_NAME) create-pod: destroy-pod - podman pod create -p $(POSTGRES_PORT):$(POSTGRES_PORT) \ + docker pod create -p $(POSTGRES_PORT):$(POSTGRES_PORT) \ -p $(OPENSEARCH_PORT1):$(OPENSEARCH_PORT1) \ -p $(STORAGE_PORT):$(STORAGE_PORT) \ --name $(POD_NAME) @@ -124,13 +124,13 @@ retest-tika: $(call run-command, python -m unittest -f tests/text_extraction_tests.py) start-apache-tika-server: - podman run -d --pod $(POD_NAME) --name $(APACHE_TIKA_CONTAINER_NAME) \ + docker run -d --pod $(POD_NAME) --name $(APACHE_TIKA_CONTAINER_NAME) \ $(IMAGE_NAMESPACE)/$(APACHE_TIKA_IMAGE_NAME):$(APACHE_TIKA_IMAGE_TAG) \ java -jar /tika-server.jar stop-apache-tika-server: - podman stop --ignore $(APACHE_TIKA_CONTAINER_NAME) - podman rm --force --ignore $(APACHE_TIKA_CONTAINER_NAME) + docker stop --ignore $(APACHE_TIKA_CONTAINER_NAME) + docker rm --force --ignore $(APACHE_TIKA_CONTAINER_NAME) .PHONY: apache-tika-server apache-tika-server: stop-apache-tika-server start-apache-tika-server @@ -151,13 +151,13 @@ coverage: prepare-test-env .PHONY: stop-storage stop-storage: - podman rm --force --ignore $(STORAGE_CONTAINER_NAME) + docker rm --force --ignore $(STORAGE_CONTAINER_NAME) .PHONY: storage storage: stop-storage start-storage wait-storage start-storage: - podman run -d --rm -ti \ + docker run -d --rm -ti \ --name $(STORAGE_CONTAINER_NAME) \ --pod $(POD_NAME) \ -e MINIO_ACCESS_KEY=$(STORAGE_ACCESS_KEY) \ @@ -170,13 +170,13 @@ wait-storage: .PHONY: stop-database stop-database: - podman rm --force --ignore $(DATABASE_CONTAINER_NAME) + docker rm --force --ignore $(DATABASE_CONTAINER_NAME) .PHONY: database database: stop-database start-database wait-database start-database: - podman run -d --rm -ti \ + docker run -d --rm -ti \ --name $(DATABASE_CONTAINER_NAME) \ --pod $(POD_NAME) \ -e POSTGRES_PASSWORD=$(POSTGRES_PASSWORD) \ @@ -238,7 +238,7 @@ shell-database: set-run-variable-values opensearch: stop-opensearch start-opensearch wait-opensearch start-opensearch: - podman run -d --rm -ti \ + docker run -d --rm -ti \ --name $(OPENSEARCH_CONTAINER_NAME) \ --pod $(POD_NAME) \ --env discovery.type=single-node \ @@ -246,7 +246,7 @@ start-opensearch: docker.io/opensearchproject/opensearch:2.9.0 stop-opensearch: - podman rm --force --ignore $(OPENSEARCH_CONTAINER_NAME) + docker rm --force --ignore $(OPENSEARCH_CONTAINER_NAME) wait-opensearch: $(call wait-for, localhost:9200) From c5f2974bdb103784e265a3bfeb4cc90bcf56c065 Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Mon, 18 Dec 2023 22:22:38 -0300 Subject: [PATCH 06/23] change podman to docker in wait-for Signed-off-by: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4cddbfa..5f3d5b4 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ run-command=(podman run --rm -ti --volume $(PWD):/mnt/code:rw \ --env POSTGRES_PORT=$(POSTGRES_PORT) \ $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) $1) -wait-for=(podman run --rm -ti --volume $(PWD):/mnt/code:rw \ +wait-for=(docker run --rm -ti --volume $(PWD):/mnt/code:rw \ --pod $(POD_NAME) \ --env PYTHONPATH=/mnt/code \ --env POSTGRES_PASSWORD=$(POSTGRES_PASSWORD) \ From 721aff31f1ecb1758393d9983eaf7ddec57da973 Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Tue, 19 Dec 2023 02:31:06 +0000 Subject: [PATCH 07/23] change podman to docker in create-pod and destroy-pod --- Makefile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 5f3d5b4..8c341d9 100644 --- a/Makefile +++ b/Makefile @@ -82,13 +82,14 @@ destroy: podman rmi --force $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) destroy-pod: - docker pod rm --force --ignore $(POD_NAME) + docker rm --force --ignore $(POD_NAME) create-pod: destroy-pod - docker pod create -p $(POSTGRES_PORT):$(POSTGRES_PORT) \ - -p $(OPENSEARCH_PORT1):$(OPENSEARCH_PORT1) \ - -p $(STORAGE_PORT):$(STORAGE_PORT) \ - --name $(POD_NAME) + docker container run -d -p $(POSTGRES_PORT) \ + -p $(OPENSEARCH_PORT1) \ + -p $(STORAGE_PORT) \ + --name $(POD_NAME) \ + $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) prepare-test-env: create-pod storage apache-tika-server opensearch database From 89ba401040bef587704889c25fc24836964f7abd Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Tue, 19 Dec 2023 02:35:24 +0000 Subject: [PATCH 08/23] connecting containers to 'pod' --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 8c341d9..eaf235e 100644 --- a/Makefile +++ b/Makefile @@ -125,7 +125,7 @@ retest-tika: $(call run-command, python -m unittest -f tests/text_extraction_tests.py) start-apache-tika-server: - docker run -d --pod $(POD_NAME) --name $(APACHE_TIKA_CONTAINER_NAME) \ + docker run -d --network container:$(POD_NAME) --name $(APACHE_TIKA_CONTAINER_NAME) \ $(IMAGE_NAMESPACE)/$(APACHE_TIKA_IMAGE_NAME):$(APACHE_TIKA_IMAGE_TAG) \ java -jar /tika-server.jar @@ -160,7 +160,7 @@ storage: stop-storage start-storage wait-storage start-storage: docker run -d --rm -ti \ --name $(STORAGE_CONTAINER_NAME) \ - --pod $(POD_NAME) \ + --network container:$(POD_NAME) \ -e MINIO_ACCESS_KEY=$(STORAGE_ACCESS_KEY) \ -e MINIO_SECRET_KEY=$(STORAGE_ACCESS_SECRET) \ -e MINIO_DEFAULT_BUCKETS=$(STORAGE_BUCKET):public \ @@ -179,7 +179,7 @@ database: stop-database start-database wait-database start-database: docker run -d --rm -ti \ --name $(DATABASE_CONTAINER_NAME) \ - --pod $(POD_NAME) \ + --network container:$(POD_NAME) \ -e POSTGRES_PASSWORD=$(POSTGRES_PASSWORD) \ -e POSTGRES_USER=$(POSTGRES_USER) \ -e POSTGRES_DB=$(POSTGRES_DB) \ @@ -241,7 +241,7 @@ opensearch: stop-opensearch start-opensearch wait-opensearch start-opensearch: docker run -d --rm -ti \ --name $(OPENSEARCH_CONTAINER_NAME) \ - --pod $(POD_NAME) \ + --network container:$(POD_NAME) \ --env discovery.type=single-node \ --env plugins.security.ssl.http.enabled=false \ docker.io/opensearchproject/opensearch:2.9.0 From 65e7f30e949fdfc9a4d76f0d9f6708cde9cba9f2 Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Tue, 19 Dec 2023 02:40:05 +0000 Subject: [PATCH 09/23] remove podman exclusive flag for docker rm --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index eaf235e..a962ac9 100644 --- a/Makefile +++ b/Makefile @@ -82,7 +82,7 @@ destroy: podman rmi --force $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) destroy-pod: - docker rm --force --ignore $(POD_NAME) + docker rm --force $(POD_NAME) create-pod: destroy-pod docker container run -d -p $(POSTGRES_PORT) \ From 0ae3e7cbffe7ef221251bcf120131473146afeb6 Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Tue, 19 Dec 2023 02:42:43 +0000 Subject: [PATCH 10/23] remove all --ignore flags --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index a962ac9..72b0122 100644 --- a/Makefile +++ b/Makefile @@ -131,7 +131,7 @@ start-apache-tika-server: stop-apache-tika-server: docker stop --ignore $(APACHE_TIKA_CONTAINER_NAME) - docker rm --force --ignore $(APACHE_TIKA_CONTAINER_NAME) + docker rm --force $(APACHE_TIKA_CONTAINER_NAME) .PHONY: apache-tika-server apache-tika-server: stop-apache-tika-server start-apache-tika-server @@ -152,7 +152,7 @@ coverage: prepare-test-env .PHONY: stop-storage stop-storage: - docker rm --force --ignore $(STORAGE_CONTAINER_NAME) + docker rm --force $(STORAGE_CONTAINER_NAME) .PHONY: storage storage: stop-storage start-storage wait-storage @@ -171,7 +171,7 @@ wait-storage: .PHONY: stop-database stop-database: - docker rm --force --ignore $(DATABASE_CONTAINER_NAME) + docker rm --force $(DATABASE_CONTAINER_NAME) .PHONY: database database: stop-database start-database wait-database @@ -247,7 +247,7 @@ start-opensearch: docker.io/opensearchproject/opensearch:2.9.0 stop-opensearch: - docker rm --force --ignore $(OPENSEARCH_CONTAINER_NAME) + docker rm --force $(OPENSEARCH_CONTAINER_NAME) wait-opensearch: $(call wait-for, localhost:9200) From a12a3f9d0eca9aa917672fb110e29b6f9873a7f1 Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Tue, 19 Dec 2023 02:54:52 +0000 Subject: [PATCH 11/23] add sleep command to docker run when creating the 'pod' --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 72b0122..ece5e0e 100644 --- a/Makefile +++ b/Makefile @@ -89,7 +89,7 @@ create-pod: destroy-pod -p $(OPENSEARCH_PORT1) \ -p $(STORAGE_PORT) \ --name $(POD_NAME) \ - $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) + $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) sleep 3600 prepare-test-env: create-pod storage apache-tika-server opensearch database From f4fb2ac89632742b4f00d3cdf133b22d5e5c2f2e Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Tue, 19 Dec 2023 02:58:09 +0000 Subject: [PATCH 12/23] change podman to docker in wait-for (--pod flag) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ece5e0e..d24f72b 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,7 @@ run-command=(podman run --rm -ti --volume $(PWD):/mnt/code:rw \ $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) $1) wait-for=(docker run --rm -ti --volume $(PWD):/mnt/code:rw \ - --pod $(POD_NAME) \ + --network container:$(POD_NAME) \ --env PYTHONPATH=/mnt/code \ --env POSTGRES_PASSWORD=$(POSTGRES_PASSWORD) \ --env POSTGRES_USER=$(POSTGRES_USER) \ From 0f9187f638f68cac8253988a623c8982349ffc2d Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Tue, 19 Dec 2023 02:59:47 +0000 Subject: [PATCH 13/23] remove --ignore from docker stop --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d24f72b..27f5734 100644 --- a/Makefile +++ b/Makefile @@ -130,7 +130,7 @@ start-apache-tika-server: java -jar /tika-server.jar stop-apache-tika-server: - docker stop --ignore $(APACHE_TIKA_CONTAINER_NAME) + docker stop $(APACHE_TIKA_CONTAINER_NAME) docker rm --force $(APACHE_TIKA_CONTAINER_NAME) .PHONY: apache-tika-server From 5e3ebedcab1b4ccebacd36a20daca2cb300313ad Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Tue, 19 Dec 2023 03:01:47 +0000 Subject: [PATCH 14/23] remove 'docker stop' line from stop-apache-tika-server --- Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile b/Makefile index 27f5734..fe8ced5 100644 --- a/Makefile +++ b/Makefile @@ -130,7 +130,6 @@ start-apache-tika-server: java -jar /tika-server.jar stop-apache-tika-server: - docker stop $(APACHE_TIKA_CONTAINER_NAME) docker rm --force $(APACHE_TIKA_CONTAINER_NAME) .PHONY: apache-tika-server From 3c0ecdbde55e06896db5fa7b7a892e45c2b27ff1 Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Tue, 19 Dec 2023 03:08:29 +0000 Subject: [PATCH 15/23] change podman to docker in --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index fe8ced5..28d8645 100644 --- a/Makefile +++ b/Makefile @@ -232,7 +232,7 @@ shell-run: set-run-variable-values .PHONY: shell-database shell-database: set-run-variable-values - podman exec -it $(DATABASE_CONTAINER_NAME) \ + docker exec -it $(DATABASE_CONTAINER_NAME) \ psql -h localhost -d $(POSTGRES_DB) -U $(POSTGRES_USER) opensearch: stop-opensearch start-opensearch wait-opensearch From e13134459a1d5ce873b8b874ede448d8bc3458c3 Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Fri, 22 Dec 2023 02:22:40 +0000 Subject: [PATCH 16/23] fix port publishing --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 28d8645..16a52b1 100644 --- a/Makefile +++ b/Makefile @@ -85,9 +85,9 @@ destroy-pod: docker rm --force $(POD_NAME) create-pod: destroy-pod - docker container run -d -p $(POSTGRES_PORT) \ - -p $(OPENSEARCH_PORT1) \ - -p $(STORAGE_PORT) \ + docker container run -d -p $(POSTGRES_PORT):$(POSTGRES_PORT) \ + -p $(OPENSEARCH_PORT1):$(OPENSEARCH_PORT1) \ + -p $(STORAGE_PORT):$(STORAGE_PORT) \ --name $(POD_NAME) \ $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) sleep 3600 From 2f0ea0793e5af5c4f536a44862e58f28aeb16b70 Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Fri, 22 Dec 2023 02:31:26 +0000 Subject: [PATCH 17/23] change podman to docker in re-run --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 16a52b1..1d25575 100644 --- a/Makefile +++ b/Makefile @@ -213,8 +213,8 @@ setup: set-run-variable-values create-pod storage apache-tika-server opensearch .PHONY: re-run re-run: set-run-variable-values - podman run --rm -ti --volume $(PWD):/mnt/code:rw \ - --pod $(POD_NAME) \ + docker run --rm -ti --volume $(PWD):/mnt/code:rw \ + --network container:$(POD_NAME) \ --env PYTHONPATH=/mnt/code \ --env-file envvars \ $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) python main From ba1c9d8f2063e15bdca152f6b8ac49bb518f8fec Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Sat, 23 Dec 2023 05:44:00 +0000 Subject: [PATCH 18/23] added .env to be moved to querido-diario repo --- env_raspadores.txt | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 env_raspadores.txt diff --git a/env_raspadores.txt b/env_raspadores.txt new file mode 100644 index 0000000..f3288f4 --- /dev/null +++ b/env_raspadores.txt @@ -0,0 +1,7 @@ +AWS_ACCESS_KEY_ID=minio-access-key +AWS_SECRET_ACCESS_KEY=minio-secret-key +AWS_ENDPOINT_URL=http://127.0.0.1:9000/ +AWS_REGION_NAME=us-east-1 +FILES_STORE=s3://queridodiariobucket/ +FILES_STORE_S3_ACL=public-read +QUERIDODIARIO_DATABASE_URL=postgresql+psycopg2://queridodiario:queridodiario@127.0.0.1:5432/queridodiariodb \ No newline at end of file From 86f5990c8a7ac46468fc003ffb4c1c47f7e6e9d5 Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Fri, 29 Dec 2023 03:41:36 +0000 Subject: [PATCH 19/23] refactoring file type recognition --- data_extraction/text_extraction.py | 37 +++++++++++++++--------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/data_extraction/text_extraction.py b/data_extraction/text_extraction.py index 5ed87df..36667d0 100644 --- a/data_extraction/text_extraction.py +++ b/data_extraction/text_extraction.py @@ -22,11 +22,11 @@ def _return_file_content(self, filepath: str) -> str: with open(filepath, "r") as file: return file.read() - def _try_extract_text(self, filepath: str) -> str: - if self.is_txt(filepath): + def _try_extract_text(self, filepath: str, file_type) -> str: + if self.is_txt(file_type): return self._return_file_content(filepath) with open(filepath, "rb") as file: - headers = {"Content-Type": self._get_file_type(filepath)} + headers = {"Content-Type": file_type} response = requests.put(f"{self._url}/tika", data=file, headers=headers) response.encoding = "UTF-8" return response.text @@ -34,9 +34,10 @@ def _try_extract_text(self, filepath: str) -> str: def extract_text(self, filepath: str) -> str: logging.debug(f"Extracting text from {filepath}") self.check_file_exists(filepath) - self.check_file_type_supported(filepath) + file_type = self.get_file_type(filepath) + self.check_file_type_supported(file_type) try: - return self._try_extract_text(filepath) + return self._try_extract_text(filepath, file_type) except Exception as e: raise Exception("Could not extract file content") from e @@ -44,22 +45,22 @@ def check_file_exists(self, filepath: str): if not os.path.exists(filepath): raise Exception(f"File does not exists: {filepath}") - def check_file_type_supported(self, filepath: str) -> None: + def check_file_type_supported(self, found_type) -> None: if ( - not self.is_doc(filepath) - and not self.is_pdf(filepath) - and not self.is_txt(filepath) + not self.is_doc(found_type) + and not self.is_pdf(found_type) + and not self.is_txt(found_type) ): - raise Exception("Unsupported file type: " + self.get_file_type(filepath)) + raise Exception("Unsupported file type: " + found_type) - def is_pdf(self, filepath): + def is_pdf(self, found_type): """ If the file type is pdf returns True. Otherwise, returns False """ - return self.is_file_type(filepath, file_types=["application/pdf"]) + return self.is_file_type(found_type, file_types=["application/pdf"]) - def is_doc(self, filepath): + def is_doc(self, found_type): """ If the file type is doc or similar returns True. Otherwise, returns False @@ -69,14 +70,14 @@ def is_doc(self, filepath): "application/vnd.oasis.opendocument.text", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ] - return self.is_file_type(filepath, file_types) + return self.is_file_type(found_type, file_types) - def is_txt(self, filepath): + def is_txt(self, found_type): """ If the file type is txt returns True. Otherwise, returns False """ - return self.is_file_type(filepath, file_types=["text/plain"]) + return self.is_file_type(found_type, file_types=["text/plain"]) def get_file_type(self, filepath): """ @@ -84,11 +85,11 @@ def get_file_type(self, filepath): """ return magic.from_file(filepath, mime=True) - def is_file_type(self, filepath, file_types): + def is_file_type(self, found_type, file_types): """ Generic method to check if a identified file type matches a given list of types """ - return self.get_file_type(filepath) in file_types + return found_type in file_types def get_apache_tika_server_url(): From 347689dba7b9502056231398cd3a245ba3b494be Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Fri, 29 Dec 2023 22:42:52 +0000 Subject: [PATCH 20/23] more refactoring --- data_extraction/text_extraction.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data_extraction/text_extraction.py b/data_extraction/text_extraction.py index 36667d0..79f661e 100644 --- a/data_extraction/text_extraction.py +++ b/data_extraction/text_extraction.py @@ -58,7 +58,7 @@ def is_pdf(self, found_type): If the file type is pdf returns True. Otherwise, returns False """ - return self.is_file_type(found_type, file_types=["application/pdf"]) + return found_type in ["application/pdf"] def is_doc(self, found_type): """ @@ -70,14 +70,14 @@ def is_doc(self, found_type): "application/vnd.oasis.opendocument.text", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ] - return self.is_file_type(found_type, file_types) + return found_type in file_types def is_txt(self, found_type): """ If the file type is txt returns True. Otherwise, returns False """ - return self.is_file_type(found_type, file_types=["text/plain"]) + return found_type in ["text/plain"] def get_file_type(self, filepath): """ From 977504a273957da1a34c597b9385b3afdc299cdd Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Sat, 30 Dec 2023 01:00:24 +0000 Subject: [PATCH 21/23] boolean fix for refresh_index --- index/opensearch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/opensearch.py b/index/opensearch.py index 941e248..cad23ec 100644 --- a/index/opensearch.py +++ b/index/opensearch.py @@ -37,7 +37,7 @@ def create_index(self, index_name: str = "", body: Dict = {}) -> None: def refresh_index(self, index_name: str = "") -> None: index_name = self.get_index_name(index_name) - if self.index_exists(index_name): + if not self.index_exists(index_name): return self._search_engine.indices.refresh( index=index_name, From 94ae417e9801f0c53b98044c4569f9295c2a3ca2 Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Sat, 30 Dec 2023 03:28:41 +0000 Subject: [PATCH 22/23] Revert "boolean fix for refresh_index" This reverts commit 977504a273957da1a34c597b9385b3afdc299cdd. --- index/opensearch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/opensearch.py b/index/opensearch.py index cad23ec..941e248 100644 --- a/index/opensearch.py +++ b/index/opensearch.py @@ -37,7 +37,7 @@ def create_index(self, index_name: str = "", body: Dict = {}) -> None: def refresh_index(self, index_name: str = "") -> None: index_name = self.get_index_name(index_name) - if not self.index_exists(index_name): + if self.index_exists(index_name): return self._search_engine.indices.refresh( index=index_name, From a4c7f185a317969633ae23757b42b597a5352e1a Mon Sep 17 00:00:00 2001 From: Ana-Sovat <38082922+Ana-Sovat@users.noreply.github.com> Date: Sat, 30 Dec 2023 03:32:36 +0000 Subject: [PATCH 23/23] revert changes made for developing environment --- Makefile | 44 ++++++++++++++++++++++---------------------- env_raspadores.txt | 7 ------- 2 files changed, 22 insertions(+), 29 deletions(-) delete mode 100644 env_raspadores.txt diff --git a/Makefile b/Makefile index 1d25575..34e510d 100644 --- a/Makefile +++ b/Makefile @@ -37,8 +37,8 @@ run-command=(podman run --rm -ti --volume $(PWD):/mnt/code:rw \ --env POSTGRES_PORT=$(POSTGRES_PORT) \ $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) $1) -wait-for=(docker run --rm -ti --volume $(PWD):/mnt/code:rw \ - --network container:$(POD_NAME) \ +wait-for=(podman run --rm -ti --volume $(PWD):/mnt/code:rw \ + --pod $(POD_NAME) \ --env PYTHONPATH=/mnt/code \ --env POSTGRES_PASSWORD=$(POSTGRES_PASSWORD) \ --env POSTGRES_USER=$(POSTGRES_USER) \ @@ -56,12 +56,12 @@ black: .PHONY: build-devel build-devel: - docker build --tag $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) \ + podman build --tag $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) \ -f scripts/Dockerfile $(PWD) .PHONY: build-tika-server build-tika-server: - docker build --tag $(IMAGE_NAMESPACE)/$(APACHE_TIKA_IMAGE_NAME):$(APACHE_TIKA_IMAGE_TAG) \ + podman build --tag $(IMAGE_NAMESPACE)/$(APACHE_TIKA_IMAGE_NAME):$(APACHE_TIKA_IMAGE_TAG) \ -f scripts/Dockerfile_apache_tika $(PWD) .PHONY: build @@ -82,14 +82,13 @@ destroy: podman rmi --force $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) destroy-pod: - docker rm --force $(POD_NAME) + podman pod rm --force --ignore $(POD_NAME) create-pod: destroy-pod - docker container run -d -p $(POSTGRES_PORT):$(POSTGRES_PORT) \ + podman pod create -p $(POSTGRES_PORT):$(POSTGRES_PORT) \ -p $(OPENSEARCH_PORT1):$(OPENSEARCH_PORT1) \ -p $(STORAGE_PORT):$(STORAGE_PORT) \ - --name $(POD_NAME) \ - $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) sleep 3600 + --name $(POD_NAME) prepare-test-env: create-pod storage apache-tika-server opensearch database @@ -125,12 +124,13 @@ retest-tika: $(call run-command, python -m unittest -f tests/text_extraction_tests.py) start-apache-tika-server: - docker run -d --network container:$(POD_NAME) --name $(APACHE_TIKA_CONTAINER_NAME) \ + podman run -d --pod $(POD_NAME) --name $(APACHE_TIKA_CONTAINER_NAME) \ $(IMAGE_NAMESPACE)/$(APACHE_TIKA_IMAGE_NAME):$(APACHE_TIKA_IMAGE_TAG) \ java -jar /tika-server.jar stop-apache-tika-server: - docker rm --force $(APACHE_TIKA_CONTAINER_NAME) + podman stop --ignore $(APACHE_TIKA_CONTAINER_NAME) + podman rm --force --ignore $(APACHE_TIKA_CONTAINER_NAME) .PHONY: apache-tika-server apache-tika-server: stop-apache-tika-server start-apache-tika-server @@ -151,15 +151,15 @@ coverage: prepare-test-env .PHONY: stop-storage stop-storage: - docker rm --force $(STORAGE_CONTAINER_NAME) + podman rm --force --ignore $(STORAGE_CONTAINER_NAME) .PHONY: storage storage: stop-storage start-storage wait-storage start-storage: - docker run -d --rm -ti \ + podman run -d --rm -ti \ --name $(STORAGE_CONTAINER_NAME) \ - --network container:$(POD_NAME) \ + --pod $(POD_NAME) \ -e MINIO_ACCESS_KEY=$(STORAGE_ACCESS_KEY) \ -e MINIO_SECRET_KEY=$(STORAGE_ACCESS_SECRET) \ -e MINIO_DEFAULT_BUCKETS=$(STORAGE_BUCKET):public \ @@ -170,15 +170,15 @@ wait-storage: .PHONY: stop-database stop-database: - docker rm --force $(DATABASE_CONTAINER_NAME) + podman rm --force --ignore $(DATABASE_CONTAINER_NAME) .PHONY: database database: stop-database start-database wait-database start-database: - docker run -d --rm -ti \ + podman run -d --rm -ti \ --name $(DATABASE_CONTAINER_NAME) \ - --network container:$(POD_NAME) \ + --pod $(POD_NAME) \ -e POSTGRES_PASSWORD=$(POSTGRES_PASSWORD) \ -e POSTGRES_USER=$(POSTGRES_USER) \ -e POSTGRES_DB=$(POSTGRES_DB) \ @@ -213,8 +213,8 @@ setup: set-run-variable-values create-pod storage apache-tika-server opensearch .PHONY: re-run re-run: set-run-variable-values - docker run --rm -ti --volume $(PWD):/mnt/code:rw \ - --network container:$(POD_NAME) \ + podman run --rm -ti --volume $(PWD):/mnt/code:rw \ + --pod $(POD_NAME) \ --env PYTHONPATH=/mnt/code \ --env-file envvars \ $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) python main @@ -232,21 +232,21 @@ shell-run: set-run-variable-values .PHONY: shell-database shell-database: set-run-variable-values - docker exec -it $(DATABASE_CONTAINER_NAME) \ + podman exec -it $(DATABASE_CONTAINER_NAME) \ psql -h localhost -d $(POSTGRES_DB) -U $(POSTGRES_USER) opensearch: stop-opensearch start-opensearch wait-opensearch start-opensearch: - docker run -d --rm -ti \ + podman run -d --rm -ti \ --name $(OPENSEARCH_CONTAINER_NAME) \ - --network container:$(POD_NAME) \ + --pod $(POD_NAME) \ --env discovery.type=single-node \ --env plugins.security.ssl.http.enabled=false \ docker.io/opensearchproject/opensearch:2.9.0 stop-opensearch: - docker rm --force $(OPENSEARCH_CONTAINER_NAME) + podman rm --force --ignore $(OPENSEARCH_CONTAINER_NAME) wait-opensearch: $(call wait-for, localhost:9200) diff --git a/env_raspadores.txt b/env_raspadores.txt deleted file mode 100644 index f3288f4..0000000 --- a/env_raspadores.txt +++ /dev/null @@ -1,7 +0,0 @@ -AWS_ACCESS_KEY_ID=minio-access-key -AWS_SECRET_ACCESS_KEY=minio-secret-key -AWS_ENDPOINT_URL=http://127.0.0.1:9000/ -AWS_REGION_NAME=us-east-1 -FILES_STORE=s3://queridodiariobucket/ -FILES_STORE_S3_ACL=public-read -QUERIDODIARIO_DATABASE_URL=postgresql+psycopg2://queridodiario:queridodiario@127.0.0.1:5432/queridodiariodb \ No newline at end of file