From a66007d81245d03b8cd7931111434471b135daf6 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 13 May 2024 17:59:32 -0400 Subject: [PATCH 01/44] Future comment for myself --- src/Annotation/StrawberryRunnersPostProcessor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Annotation/StrawberryRunnersPostProcessor.php b/src/Annotation/StrawberryRunnersPostProcessor.php index 1843810..1c05afb 100644 --- a/src/Annotation/StrawberryRunnersPostProcessor.php +++ b/src/Annotation/StrawberryRunnersPostProcessor.php @@ -63,7 +63,7 @@ class StrawberryRunnersPostProcessor extends Plugin { public $input_argument; /** - * Processing stage: can be Entity PreSave or PostSave + * Processing stage: can be Entity PreSave or PostSave. Pre save is good for ADO/Metadata. Implementation to follow. * * @var string $when; * From cde946d4ce12d89748eedc826cb4cc8897177538 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 13 May 2024 18:01:03 -0400 Subject: [PATCH 02/44] Don't pre-fetch the file if input_property != 'filepath' ML processors use a IIIF URL, so no need to download the file at all --- .../AbstractPostProcessorQueueWorker.php | 50 +++++++++++-------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php index 5af6e46..cdd20ba 100644 --- a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php @@ -177,6 +177,7 @@ public function processItem($data) { // every processor will work only on Files. // True for now, but eventually we want processors that do only // metadata to metadata. 
+ if (!isset($data->fid) || $data->fid == NULL || !isset($data->nid) || $data->nid == NULL || !is_array($data->metadata)) { return; } @@ -201,25 +202,30 @@ public function processItem($data) { return; } - $filelocation = $this->ensureFileAvailability($file); - - if ($filelocation === FALSE) { - $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing aborted for ADO Node ID @nodeid because we could not ensure a local file location needed for @processor. You might have run out space or have permission issues or (less likely) the original File/ADO was removed milliseconds ago.', - [ - '@processor' => $processor_instance->label(), - '@nodeid' => $data->nid, - ] - ); - // Note. If $filelocation could not be acquired, means we do not need to compost neither - // its already gone/not possible - return; + // We only need to ensure $file if we are going to use the actual file for processing. + if ($processor_instance-->getPluginDefinition()['input_property'] == 'filepath') { + $filelocation = $this->ensureFileAvailability($file); + if ($filelocation === FALSE) { + $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing aborted for ADO Node ID @nodeid because we could not ensure a local file location needed for @processor. You might have run out space or have permission issues or (less likely) the original File/ADO was removed milliseconds ago.', + [ + '@processor' => $processor_instance->label(), + '@nodeid' => $data->nid, + ] + ); + // Note. If $filelocation could not be acquired, means we do not need to compost neither + // its already gone/not possible + return; + } + // Means we could pass also a file directly anytime. But not really as such + // only into $data->filepath but not into $filelocation bc + // that would compost and remove the file. What if its needed later? + $data->filepath = $filelocation; + // We preset it up here. 
+ $this->instanceFiles = [$filelocation]; + } + else { + $data->filepath = NULL; } - // Means we could pass also a file directly anytime. But not really as such - // only into $data->filepath but not into $filelocation bc - // that would compost and remove the file. What if its needed later? - $data->filepath = $filelocation; - // We preset it up here. - $this->instanceFiles = [$filelocation]; if (!isset($processor_config['output_destination']) || !is_array($processor_config['output_destination'])) { $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing aborted for ADO Node ID @nodeid because there is no output destination setup for @processor', @@ -255,7 +261,7 @@ public function processItem($data) { // If not cleaned up before // AND won't chain in the future - $needs_localfile_cleanup = !$will_chain_future && !$data->sbr_cleanedup_before; + $needs_localfile_cleanup = !$will_chain_future && !$data->sbr_cleanedup_before && $processor_instance-->getPluginDefinition()['input_property'] == 'filepath'; // We set this before triggering cleanup, means future thinking // bc we need to make sure IF there is a next processor it will get // The info that during this queuworker processing cleanup at the end @@ -314,6 +320,8 @@ public function processItem($data) { if (is_a($entity, TranslatableInterface::class)) { $translations = $entity->getTranslationLanguages(); foreach ($translations as $translation_id => $translation) { + // checksum and file->uuid apply even if the source is not a local-ized/ensure local file. + // But we will have to change this if we plan on indexing JSON RAW directly as an vector embedding. $item_id = $entity->id() . ':' . $sequence_key . ':' . $translation_id . ':' . $file->uuid() . ':' . $data->plugin_config_entity_id; // a single 0 as return will force us to reindex. 
$inindex = $inindex * $this->flavorInSolrIndex($item_id, $data->metadata['checksum'], $indexes); @@ -479,8 +487,8 @@ public function processItem($data) { // The count will always be relative to this call // Means count of how many children are being called. $childdata->siblings = count($input_argument_value); - // In case the $input_property_value is an array coming from a plugin we may want to if has the same amount of values of $input_argument_value - // If so its many to one and we only need the corresponding entry to this sequence + // In case the $input_property_value is an array coming from a plugin we may want to know if it has the same amount of values of $input_argument_value + // If so, it is many to one, and we only need the corresponding entry to this sequence if ($input_property_value_from_plugin && is_array($input_property_value) && count($input_property_value) == $childdata->siblings && From 746210d8ccca0030ae67c5cb37692c19ce01131c Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 13 May 2024 18:01:23 -0400 Subject: [PATCH 03/44] Cleanup on OCR. 
Mostly comments/unreachable code --- .../StrawberryRunnersPostProcessor/OcrPostProcessor.php | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php index cda3298..88f5d97 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php @@ -10,7 +10,6 @@ use Drupal\Core\Cache\CacheBackendInterface; use Drupal\Core\Form\FormStateInterface; -use Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor; use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginInterface; use Drupal\strawberryfield\Plugin\search_api\datasource\StrawberryfieldFlavorDatasource; use Drupal\strawberry_runners\Web64\Nlp\NlpClient; @@ -344,7 +343,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $io->output = $output; } } - //if not searchable run try to load the ADO, check if there is a as:text HOCR with the same size + //if not searchable run try to load the ADO, check if there is an as:text HOCR with the same size //as the current Image and try to process, if not, run, tesseract $width = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['width'] ?? NULL; $height = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['height'] ?? NULL; @@ -696,7 +695,6 @@ public function buildExecutableCommand(\stdClass $io) { // Only return $command if it contains the original filepath somewhere if (strpos($command, $file_path) !== FALSE) { return $command; - error_log($command); } return NULL; } From 88585a5f33d6c3c8abdbac71abc3c71174017555 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 13 May 2024 19:26:42 -0400 Subject: [PATCH 04/44] Allow Metadata only processing. 
Needs more code on the Queue itself --- src/strawberryRunnerUtilityService.php | 90 ++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/src/strawberryRunnerUtilityService.php b/src/strawberryRunnerUtilityService.php index ba76358..57ed391 100644 --- a/src/strawberryRunnerUtilityService.php +++ b/src/strawberryRunnerUtilityService.php @@ -158,7 +158,10 @@ public function invokeProcessorForAdo(ContentEntityInterface $entity, array $sbf "weight" => "0" "configEntity" => "test" ]*/ + // @TODO document how an ADO processor config would look like. $askeymap = []; + $jmespathsmap = []; + //Plugins that run on Files attached to an ADO if (isset($active_plugins['entity:file'])) { foreach ($active_plugins['entity:file'] as $activePluginId => $config) { // Only add to $askeymap if $filter is empty or $activePluginId is in the $filter. @@ -172,6 +175,21 @@ public function invokeProcessorForAdo(ContentEntityInterface $entity, array $sbf } } } + //Plugins that run on Metadata of an ADO + if (isset($active_plugins['entity:node'])) { + foreach ($active_plugins['entity:node'] as $activePluginId => $config) { + if (empty($filter) || in_array($activePluginId, $filter)) { + if ($config['source_type'] == 'ado') { + $jmespaths = array_filter($config['jmespath'] ?? []); + // This assumes we always need a jmespath. Review the idea of "all" without a jmespath + // which is really an *. But for now we prefer the "verbosity" of having one. 
+ foreach ($jmespaths as $key => $value) { + $jmespathsmap[$key][$activePluginId] = $config; + } + } + } + } + } foreach ($sbf_fields as $field_name) { /* @var $field \Drupal\Core\Field\FieldItemInterface */ @@ -188,6 +206,7 @@ public function invokeProcessorForAdo(ContentEntityInterface $entity, array $sbf if (isset($flatvalues['type'])) { $sbf_type = (array) $flatvalues['type']; } + // File level plugins $plugin_definition['input_type'] == entity:file foreach ($askeymap as $jsonkey => $activePlugins) { if (isset($flatvalues[$jsonkey])) { foreach ($flatvalues[$jsonkey] as $uniqueid => $asstructure) { @@ -298,6 +317,77 @@ public function invokeProcessorForAdo(ContentEntityInterface $entity, array $sbf } } } + // JSON/Metadata level plugins coming from ADO JSON directly , $plugin_definition['input_type'] == entity:node + foreach ($jmespathsmap as $jmespath => $activePlugins) { + // evaluate the jmespath + $metadata_from_json = NULL; + if (is_string($jmespath)) { + $metadata_from_json = $itemfield->searchPath($jmespath); + } + if (!empty($metadata_from_json)) { + foreach ($activePlugins as $activePluginId => $config) { + // Checks if the flag is set and is an array. + $nopost = (isset($flatvalues["ap:tasks"]["ap:nopost"]) && + is_array($flatvalues["ap:tasks"]["ap:nopost"])); + + if ($nopost) { + if (in_array($activePluginId, $flatvalues["ap:tasks"]["ap:nopost"])) { + // if we have an entry like ["ap:tasks"]["ap:nopost"][0] == "pager" we don't run pager + // for this ADO. We won't delete existing ones. Just never process. + continue; + } + } + // @TODO how to avoid running on metadata generated by a processor? + // We should limit where that metadata goes. Should never be the same ADO? 
+ $valid_ado_type = explode(',', $config['ado_type']); + $valid_ado_type = array_map('trim', $valid_ado_type); + if (empty($config['ado_type']) + || count( + array_intersect($valid_ado_type, $sbf_type) + ) > 0 + ) { + + $data = new \stdClass(); + $data->fid = NULL; + $data->nid = $entity->id(); + $data->nuuid = $entity->uuid(); + $data->field_name = $field_name; + $data->field_delta = $delta; + // There is no cleanup needed here, so we avoid even triggering the option + // by saying. This was cleaned up before. + $data->sbr_cleanedup_before = TRUE; + // Get the configured Language from descriptive metadata + if (isset($config['language_key']) + && !empty($config['language_key']) + && isset($flatvalues[$config['language_key']]) + ) { + $data->lang = is_array( + $flatvalues[$config['language_key']] + ) ? array_values($flatvalues[$config['language_key']]) + : [$flatvalues[$config['language_key']]]; + } + else { + $data->lang = $config['language_default'] ?? NULL; + } + // Check if there is a key that forces processing. + $force_from_metadata_or_arg + = isset($flatvalues["ap:tasks"]["ap:forcepost"]) + ? (bool) $flatvalues["ap:tasks"]["ap:forcepost"] + : $force; + // In this case metadata is the actual metadata as filtered by a JMESPATH. + // We double nest, so we can add a sibling key "checksum" + // NOTE: means when a processor needs to access metadata it will have to digg inside a "json" + // key. Same if it generates data for a nested processor. + $data->metadata = ["json" => $metadata_from_json, "checksum" => md5($metadata_from_json)]; + $data->force = $force_from_metadata_or_arg; + $data->plugin_config_entity_id = $activePluginId; + $this->queueFactory->get( + 'strawberryrunners_process_index', TRUE + )->createItem($data); + } + } + } + } } } } From 6a3df50077f0a8cb5b6aac6c1d5daf0c870d3165 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 13 May 2024 22:13:30 -0400 Subject: [PATCH 05/44] Focus diego! 
--- src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php index cdd20ba..81f94a4 100644 --- a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php @@ -203,7 +203,7 @@ public function processItem($data) { } // We only need to ensure $file if we are going to use the actual file for processing. - if ($processor_instance-->getPluginDefinition()['input_property'] == 'filepath') { + if ($processor_instance->getPluginDefinition()['input_property'] == 'filepath') { $filelocation = $this->ensureFileAvailability($file); if ($filelocation === FALSE) { $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing aborted for ADO Node ID @nodeid because we could not ensure a local file location needed for @processor. You might have run out space or have permission issues or (less likely) the original File/ADO was removed milliseconds ago.', @@ -261,7 +261,7 @@ public function processItem($data) { // If not cleaned up before // AND won't chain in the future - $needs_localfile_cleanup = !$will_chain_future && !$data->sbr_cleanedup_before && $processor_instance-->getPluginDefinition()['input_property'] == 'filepath'; + $needs_localfile_cleanup = !$will_chain_future && !$data->sbr_cleanedup_before && $processor_instance->getPluginDefinition()['input_property'] == 'filepath'; // We set this before triggering cleanup, means future thinking // bc we need to make sure IF there is a next processor it will get // The info that during this queuworker processing cleanup at the end From 4d6baea2a557b568d643255adc1567b8b88cbc0a Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 14 May 2024 07:45:34 -0400 Subject: [PATCH 06/44] First pass on an abstract ML processor. Abstract bc it can't be used directly. 
Because each ML model is quite opinionated. Full of bugs, lacks of checks but it works. Will keep refining and trying to make it elegant before merging --- .../AbstractPostProcessorQueueWorker.php | 8 +- .../abstractMLPostProcessor.php | 418 ++++++++++++++++++ 2 files changed, 425 insertions(+), 1 deletion(-) create mode 100644 src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php diff --git a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php index 81f94a4..082f357 100644 --- a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php @@ -366,6 +366,7 @@ public function processItem($data) { // Check if $io->output exists? $toindex = new stdClass(); $toindex->fulltext = $io->output->searchapi['fulltext'] ?? ''; + $toindex->config_processor_id = $data->plugin_config_entity_id ?? ''; $toindex->plaintext = $io->output->searchapi['plaintext'] ?? ''; $toindex->metadata = $io->output->searchapi['metadata'] ?? []; $toindex->who = $io->output->searchapi['who'] ?? []; @@ -378,7 +379,12 @@ public function processItem($data) { $toindex->sentiment = $io->output->searchapi['sentiment'] ?? 0; $toindex->nlplang = $io->output->searchapi['nlplang'] ?? []; $toindex->processlang = $io->output->searchapi['processlang'] ?? []; - $toindex->config_processor_id = $data->plugin_config_entity_id ?? ''; + // ML ones. + $toindex->vector_384 = $io->output->searchapi['vector_384'] ?? NULL; + $toindex->vector_512 = $io->output->searchapi['vector_512'] ?? NULL; + $toindex->vector_576 = $io->output->searchapi['vector_576'] ?? NULL; + $toindex->vector_1024 = $io->output->searchapi['vector_1024'] ?? NULL; + $toindex->service_md5 = $io->output->searchapi['vector_1024'] ?? ''; // $siblings will be the amount of total children processors that were // enqueued for a single Processor chain. 
diff --git a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php new file mode 100644 index 0000000..f8d33cd --- /dev/null +++ b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php @@ -0,0 +1,418 @@ + 'asstructure', + 'mime_type' => ['image/jpeg'], + 'output_type' => 'json', + 'output_destination' => 'searchapi', + 'processor_queue_type' => 'background', + 'language_key' => 'language_iso639_3', + 'language_default' => 'eng', + 'timeout' => 300, + 'nlp_url' => 'http://esmero-nlp:6400', + 'ml_method' => 'yolov8', + 'iiif_server' => '', + ] + parent::defaultConfiguration(); + } + + + public function calculateDependencies() { + // Since Processors could be chained we need to check if any other + // processor instance is using an instance of this one + // @TODO: Implement calculateDependencies() method. + } + + public function settingsForm(array $parents, FormStateInterface $form_state) { + + $element['source_type'] = [ + '#type' => 'select', + '#title' => $this->t('The type of source data this processor works on'), + '#options' => [ + 'asstructure' => 'File entities referenced in the as:filetype JSON structure', + 'ado' => 'ADO Strawberryfield JSON', + 'json' => 'JSON provided by another Processor' + ], + '#default_value' => $this->getConfiguration()['source_type'], + '#description' => $this->t('Select from where the source data this processor needs is fetched'), + '#required' => TRUE, + ]; + + $element['ado_type'] = [ + '#type' => 'textfield', + '#title' => $this->t('ADO type(s) to limit this processor to.'), + '#default_value' => $this->getConfiguration()['ado_type'], + '#description' => $this->t('A single ADO type or a coma delimited list of ado types that qualify to be Processed. 
Leave empty to apply to all ADOs.'), + ]; + + $element['jsonkey'] = [ + '#type' => 'checkboxes', + '#title' => $this->t('The JSON key that contains the desired source.'), + '#options' => [ + 'as:image' => 'as:image', + 'as:document' => 'as:document', + 'as:audio' => 'as:audio', + 'as:video' => 'as:video', + 'as:text' => 'as:text', + 'as:application' => 'as:application', + ], + '#default_value' => (!empty($this->getConfiguration()['jsonkey']) && is_array($this->getConfiguration()['jsonkey'])) ? $this->getConfiguration()['jsonkey'] : [], + '#states' => [ + 'visible' => [ + ':input[name="pluginconfig[source_type]"]' => ['value' => 'asstructure'], + ], + ], + ]; + + $element['jmespath'] = [ + '#type' => 'textfield', + '#title' => $this->t('Jmespath used to fetch/prefilter the metadata passed as JSON to the processor'), + '#default_value' => (!empty($this->getConfiguration()['jmespath']) && is_array($this->getConfiguration()['jmespath'])) ? $this->getConfiguration()['jmespath'] : [], + '#states' => [ + 'visible' => [ + ':input[name="pluginconfig[source_type]"]' => ['value' => 'ado'], + ':input[name="pluginconfig[source_type]"]' => ['value' => 'json'], + ], + ], + ]; + + $element['mime_type'] = [ + '#type' => 'textfield', + '#title' => $this->t('Mimetypes(s) to limit this Processor to.'), + '#default_value' => $this->getConfiguration()['mime_type'], + '#description' => $this->t('A single Mimetype type or a comma separated list of mimetypes that qualify to be Processed. Leave empty to apply any file'), + '#states' => [ + 'visible' => [ + ':input[name="pluginconfig[source_type]"]' => ['value' => 'asstructure'], + ], + ], + ]; + + $element['language_key'] = [ + '#type' => 'textfield', + '#title' => $this->t("Within the ADO's metadata, the JSON key that contains the language in ISO639-3 (3 letter)"), + '#default_value' => (!empty($this->getConfiguration()['language_key'])) ? 
$this->getConfiguration()['language_key'] : '', + '#required' => TRUE, + ]; + + $element['language_default'] = [ + '#type' => 'textfield', + '#title' => $this->t("Please provide a default language in ISO639-3 (3 letter) format. If none is provided we will use 'eng' "), + '#default_value' => (!empty($this->getConfiguration()['language_default'])) ? $this->getConfiguration()['language_default'] : 'eng', + '#required' => TRUE, + ]; + + + $element['output_type'] = [ + '#type' => 'select', + '#title' => $this->t('The expected and desired output of this processor.'), + '#options' => [ + 'json' => 'Data/Values that can be serialized to JSON', + ], + '#default_value' => $this->getConfiguration()['output_type'], + '#description' => $this->t('ML processors only generate JSON'), + ]; + + $element['output_destination'] = [ + '#type' => 'checkboxes', + '#title' => $this->t("Where and how the output will be used."), + '#options' => [ + 'plugin' => 'As Input for another processor Plugin', + 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for ML Vector Comparison)', + ], + '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? 
$this->getConfiguration()['output_destination'] : [], + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#required' => TRUE, + ]; + + $element['processor_queue_type'] = [ + '#type' => 'select', + '#title' => $this->t('The queue to use for this processor.'), + '#options' => [ + 'background' => 'Secondary queue in background', + 'realtime' => 'Primary queue in realtime', + ], + '#default_value' => $this->getConfiguration()['processor_queue_type'], + '#description' => $this->t('The primary queue will be execute in realtime while the Secondary will be execute in background'), + '#required' => TRUE, + ]; + + $element['nlp_url'] = [ + '#type' => 'url', + '#title' => $this->t("The URL location of your NLP64/ML server."), + '#default_value' => $this->getConfiguration()['nlp_url'] ?? 'http://esmero-nlp:6400', + '#description' => t('Defaults to http://esmero-nlp:6400'), + '#required' => TRUE, + ]; + + $element['ml_method'] = [ + '#type' => 'radios', + '#title' => $this->t('Which ML endpoint to use'), + '#options' => [ + '/image/yolo' => 'yolov8 (Image Object detection (as MiniOCR Annotations) & embedding as a Unit Length Vector)', + '/image/mobilenet' => 'MobileNet (Image embeddings as a a Unit Length Vector)', + '/text/bert' => 'Bert (text embeddings as a Unit Length Vector)', + '/image/insightface' => 'InsightFace (Detection only as MiniOCR Annotations)', + ], + '#default_value' => $this->getConfiguration()['ml_method'], + '#description' => $this->t('The ML endpoint/Model. 
Depending on the choice the actual value/size of data ingested will vary.'), + '#required' => TRUE, + ]; + + $element['iiif_server'] = [ + '#type' => 'url', + '#title' => $this->t('The IIIF Server to use for Image ML'), + '#default_value' => $this->getConfiguration()['iiif_server'] ?: \Drupal::service('config.factory') + ->get('format_strawberryfield.iiif_settings') + ->get('int_server_url'), + '#description' => $this->t('The IIIF Server to use. By default we will use the Internal (esmero-cantaloupe) endpoint'), + '#required' => TRUE, + ]; + + $element['timeout'] = [ + '#type' => 'number', + '#title' => $this->t('Timeout in seconds for this process.'), + '#default_value' => $this->getConfiguration()['timeout'], + '#description' => $this->t('If the process runs out of time it can still be processed again.'), + '#size' => 4, + '#maxlength' => 4, + '#min' => 1, + ]; + $element['weight'] = [ + '#type' => 'number', + '#title' => $this->t('Order or execution in the global chain.'), + '#default_value' => $this->getConfiguration()['weight'], + ]; + return $element; + } + + + + public function onDependencyRemoval(array $dependencies) { + // Since Processors could be chained we need to check if any other + // processor instance is using an instance of this one + return parent::onDependencyRemoval( + $dependencies + ); // TODO: Change the autogenerated stub + } + + /** + * Executes the logic of this plugin given a file path and a context. 
+ * + * @param \stdClass $io + * $io->input needs to contain + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_arguments + * $io->output will contain the result of the processor + * @param string $context + */ + public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPluginInterface::PROCESS) { + $input_property = $this->pluginDefinition['input_property']; + $file_uuid = isset($io->input->metadata['dr:uuid']) ? $io->input->metadata['dr:uuid'] : NULL; + $node_uuid = isset($io->input->nuuid) ? $io->input->nuuid : NULL; + + $config = $this->getConfiguration(); + $timeout = $config['timeout']; // in seconds + $output = new \stdClass(); + + if (!empty($config['nlp_url']) && !empty($config['ml_method'])) { + $nlp = new NlpClient($config['nlp_url']); + if ($nlp) { + $capabilities = $nlp->get_call('/status', NULL); + $languages_enabled = []; + $detected_lang = NULL; + //@TODO Should cache this too. Or deprecate ::language for 0.5.0 + if ($capabilities + && is_array($capabilities) + && is_array($capabilities['web64']['endpoints']) + && in_array($config['ml_method'], $capabilities['web64']['endpoints'])) { + + + if (in_array($config['source_type'], ['asstructure']) && isset($io->input->{$input_property}) && $file_uuid && $node_uuid) { + $mloutput = $this->runImageMLfromIIIF($io, $nlp); + $io->output = $mloutput ?? $output;; + } + elseif (in_array($config['source_type'], ['ado', 'json']) && $node_uuid) { + $mloutput = $this->runTextMLfromMetadata($io, $nlp); + $io->output = $mloutput ?? $output; + } + else { + throw new \Exception("Invalid argument(s) for ML processor"); + } + } + else { + throw new \Exception("Your NLP/ML endpoint does not provide ". $config['ml_method'] . 
' capabilities'); + } + } + else { + throw new \Exception("NLP/ML endpoint did not respond"); + } + } + else { + throw new \Exception("Missing ML Configuration(s) for ML processor"); + } + } + + protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { + // This is an example. Each implementing class needs to deal with actual processing of output of the endpoint + $output = new \stdClass(); + $config = $this->getConfiguration(); + $input_argument = $this->pluginDefinition['input_argument']; + $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? '') : 'eng']; + // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} + $sequence_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; + setlocale(LC_CTYPE, 'en_US.UTF-8'); + $width = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['width'] ?? NULL; + $height = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['height'] ?? NULL; + if (!($width && $height)) { + $width = $io->input->metadata['flv:exif']['ImageWidth'] ?? NULL; + $height = $io->input->metadata['flv:exif']['ImageHeight'] ?? NULL; + } + + $ML = $nlpClient->get_call($config['ml_method'], [], 'en'); + $output->searchapi['plaintext'] = ''; + $output->searchapi['processlang'] = $file_languages; + $output->searchapi['ts'] = date("c"); + $output->searchapi['label'] = $this->t("ML Image Embeddings & Vectors") . ' ' . $sequence_number; + return $output; + } + + protected function runTextMLfromMetadata($io, NlpClient $nlpClient) { + $output = new \stdClass(); + $config = $this->getConfiguration(); + $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? 
'') : 'eng']; + $output->searchapi['plaintext'] = ''; + $output->searchapi['processlang'] = $file_languages; + $output->searchapi['ts'] = date("c"); + $output->searchapi['label'] = $this->t("ML Text Embedding"); + return $output; + } + + protected function hOCRtoMiniOCR($output, $pageid) { + $hocr = simplexml_load_string($output); + $internalErrors = libxml_use_internal_errors(TRUE); + libxml_clear_errors(); + libxml_use_internal_errors($internalErrors); + if (!$hocr) { + return NULL; + } + $miniocr = new \XMLWriter(); + $miniocr->openMemory(); + $miniocr->startDocument('1.0', 'UTF-8'); + $miniocr->startElement("ocr"); + $atleastone_word = FALSE; + $pages = $hocr->body->children() ?? []; + foreach ($pages as $page) { + $titleparts = explode(';', $page['title']); + $pagetitle = NULL; + foreach ($titleparts as $titlepart) { + $titlepart = trim($titlepart ?? ''); + $title_pos = strpos($titlepart, 'bbox'); + // External/old HOCR might have more data before the bbox. + if ($title_pos !== FALSE) { + $pagetitle = substr($titlepart, $title_pos + 5); + } + } + if ($pagetitle == NULL) { + $miniocr->flush(); + return NULL; + } + $coos = explode(" ", $pagetitle); + // To avoid divisions by 0 + $pwidth = (float) $coos[2] ? (float) $coos[2] : 1; + $pheight = (float) $coos[3] ? (float) $coos[3] : 1; + // NOTE: floats are in the form of .1 so we need to remove the first 0. + if (count($coos)) { + $miniocr->startElement("p"); + $miniocr->writeAttribute("xml:id", 'sequence_' . $pageid); + $miniocr->writeAttribute("wh", + ltrim($pwidth ?? '', 0) . " " . ltrim($pheight ?? 
'', 0)); + $miniocr->startElement("b"); + $page->registerXPathNamespace('ns', 'http://www.w3.org/1999/xhtml'); + foreach ($page->xpath('.//ns:span[@class="ocr_line"]') as $line) { + $notFirstWord = FALSE; + $miniocr->startElement("l"); + foreach ($line->children() as $word) { + $wcoos = explode(" ", $word['title']); + if (count($wcoos) >= 5) { + $x0 = (float) $wcoos[1]; + $y0 = (float) $wcoos[2]; + $x1 = (float) $wcoos[3]; + $y1 = (float) $wcoos[4]; + $l = ltrim(sprintf('%.3f', ($x0 / $pwidth)) ?? '', 0); + $t = ltrim(sprintf('%.3f', ($y0 / $pheight)) ?? '', 0); + $w = ltrim(sprintf('%.3f', (($x1 - $x0) / $pwidth)) ?? '', 0); + $h = ltrim(sprintf('%.3f', (($y1 - $y0) / $pheight)) ?? '', 0); + $text = (string) $word; + if ($notFirstWord) { + $miniocr->text(' '); + } + $notFirstWord = TRUE; + // New OCR Highlight does not like empty tags at all + if (strlen(trim($text ?? '')) > 0) { + $miniocr->startElement("w"); + $miniocr->writeAttribute("x", + $l . ' ' . $t . ' ' . $w . ' ' . $h); + $miniocr->text($text); + // Only assume we have at least one word for tags + // Since lines? could end empty? + $atleastone_word = TRUE; + $miniocr->endElement(); + } + } + } + $miniocr->endElement(); + } + $miniocr->endElement(); + $miniocr->endElement(); + } + } + $miniocr->endElement(); + $miniocr->endDocument(); + unset($hocr); + if ($atleastone_word) { + return $miniocr->outputMemory(TRUE); + } + else { + return StrawberryfieldFlavorDatasource::EMPTY_MINIOCR_XML; + } + } + + + // Mime types supported as input to Tesseract. 
+ // See https://github.com/tesseract-ocr/tessdoc/blob/main/InputFormats.md + public function isImageMLMimeType($mime_type): bool { + $image_ML_mime_types = [ + 'image/png', + 'image/jpeg', + 'image/tiff', + 'image/jp2', + 'application/pdf', + ]; + return in_array($mime_type, $image_ML_mime_types); + } + +} From f81c5dd95ddec2930b965105f095cc662cb08625 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 14 May 2024 07:46:07 -0400 Subject: [PATCH 07/44] ML YOLO. Missing some $output keys and validations still Generates the proper embeddings and those end properly in Solr --- .../MLYoloPostProcessor.php | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php new file mode 100644 index 0000000..28bc4be --- /dev/null +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php @@ -0,0 +1,97 @@ + 'asstructure', + 'mime_type' => ['image/jpeg'], + 'output_type' => 'json', + 'output_destination' => 'searchapi', + 'processor_queue_type' => 'background', + 'language_key' => 'language_iso639_3', + 'language_default' => 'eng', + 'timeout' => 300, + 'nlp_url' => 'http://esmero-nlp:6400', + 'ml_method' => '/image/yolov8', + ] + parent::defaultConfiguration(); + } + + public function settingsForm(array $parents, FormStateInterface $form_state) { + $element = parent::settingsForm($parents, $form_state); + return $element; + } + + protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { + $output = new \stdClass(); + $config = $this->getConfiguration(); + $input_argument = $this->pluginDefinition['input_argument']; + $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? 
'') : 'eng']; + // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} + $sequence_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; + setlocale(LC_CTYPE, 'en_US.UTF-8'); + $width = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['width'] ?? NULL; + $height = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['height'] ?? NULL; + if (!($width && $height)) { + $width = $io->input->metadata['flv:exif']['ImageWidth'] ?? NULL; + $height = $io->input->metadata['flv:exif']['ImageHeight'] ?? NULL; + } + $iiifidentifier = urlencode( + StreamWrapperManager::getTarget( isset($io->input->metadata['url']) ? $io->input->metadata['url'] : NULL) + ); + + if ($iiifidentifier == NULL || empty($iiifidentifier)) { + return $output; + } + $arguments['iiif_image_url'] = $config['iiif_server']."/{$iiifidentifier}/full/full/0/default.jpg"; + $arguments['labels'] = []; + + $ML = $nlpClient->get_call($config['ml_method'], $arguments, 'en'); + $output->searchapi['vector_576'] = $ML['yolo']['vector'] ?? NULL; + $output->searchapi['service_md5'] = isset($ML['yolo']['modelinfo']) ? md5(json_encode($ML['yolo']['modelinfo'])) : NULL; + $output->searchapi['plaintext'] = ''; + $output->searchapi['processlang'] = $file_languages; + $output->searchapi['ts'] = date("c"); + $output->searchapi['label'] = $this->t("ML Image Embeddings & Vectors") . ' ' . 
$sequence_number; + return $output; + } + + +} From faf595c60db2648347ffe85a5267d37e911da908 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 14 May 2024 10:25:45 -0400 Subject: [PATCH 08/44] remove unused method --- .../abstractMLPostProcessor.php | 91 ------------------- 1 file changed, 91 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php index f8d33cd..b43406f 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php @@ -311,97 +311,6 @@ protected function runTextMLfromMetadata($io, NlpClient $nlpClient) { return $output; } - protected function hOCRtoMiniOCR($output, $pageid) { - $hocr = simplexml_load_string($output); - $internalErrors = libxml_use_internal_errors(TRUE); - libxml_clear_errors(); - libxml_use_internal_errors($internalErrors); - if (!$hocr) { - return NULL; - } - $miniocr = new \XMLWriter(); - $miniocr->openMemory(); - $miniocr->startDocument('1.0', 'UTF-8'); - $miniocr->startElement("ocr"); - $atleastone_word = FALSE; - $pages = $hocr->body->children() ?? []; - foreach ($pages as $page) { - $titleparts = explode(';', $page['title']); - $pagetitle = NULL; - foreach ($titleparts as $titlepart) { - $titlepart = trim($titlepart ?? ''); - $title_pos = strpos($titlepart, 'bbox'); - // External/old HOCR might have more data before the bbox. - if ($title_pos !== FALSE) { - $pagetitle = substr($titlepart, $title_pos + 5); - } - } - if ($pagetitle == NULL) { - $miniocr->flush(); - return NULL; - } - $coos = explode(" ", $pagetitle); - // To avoid divisions by 0 - $pwidth = (float) $coos[2] ? (float) $coos[2] : 1; - $pheight = (float) $coos[3] ? (float) $coos[3] : 1; - // NOTE: floats are in the form of .1 so we need to remove the first 0. 
- if (count($coos)) { - $miniocr->startElement("p"); - $miniocr->writeAttribute("xml:id", 'sequence_' . $pageid); - $miniocr->writeAttribute("wh", - ltrim($pwidth ?? '', 0) . " " . ltrim($pheight ?? '', 0)); - $miniocr->startElement("b"); - $page->registerXPathNamespace('ns', 'http://www.w3.org/1999/xhtml'); - foreach ($page->xpath('.//ns:span[@class="ocr_line"]') as $line) { - $notFirstWord = FALSE; - $miniocr->startElement("l"); - foreach ($line->children() as $word) { - $wcoos = explode(" ", $word['title']); - if (count($wcoos) >= 5) { - $x0 = (float) $wcoos[1]; - $y0 = (float) $wcoos[2]; - $x1 = (float) $wcoos[3]; - $y1 = (float) $wcoos[4]; - $l = ltrim(sprintf('%.3f', ($x0 / $pwidth)) ?? '', 0); - $t = ltrim(sprintf('%.3f', ($y0 / $pheight)) ?? '', 0); - $w = ltrim(sprintf('%.3f', (($x1 - $x0) / $pwidth)) ?? '', 0); - $h = ltrim(sprintf('%.3f', (($y1 - $y0) / $pheight)) ?? '', 0); - $text = (string) $word; - if ($notFirstWord) { - $miniocr->text(' '); - } - $notFirstWord = TRUE; - // New OCR Highlight does not like empty tags at all - if (strlen(trim($text ?? '')) > 0) { - $miniocr->startElement("w"); - $miniocr->writeAttribute("x", - $l . ' ' . $t . ' ' . $w . ' ' . $h); - $miniocr->text($text); - // Only assume we have at least one word for tags - // Since lines? could end empty? - $atleastone_word = TRUE; - $miniocr->endElement(); - } - } - } - $miniocr->endElement(); - } - $miniocr->endElement(); - $miniocr->endElement(); - } - } - $miniocr->endElement(); - $miniocr->endDocument(); - unset($hocr); - if ($atleastone_word) { - return $miniocr->outputMemory(TRUE); - } - else { - return StrawberryfieldFlavorDatasource::EMPTY_MINIOCR_XML; - } - } - - // Mime types supported as input to Tesseract. 
// See https://github.com/tesseract-ocr/tessdoc/blob/main/InputFormats.md public function isImageMLMimeType($mime_type): bool { From 76f40a68fc5a2d22605b2c4ba5e97b6357540bb3 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 14 May 2024 10:26:07 -0400 Subject: [PATCH 09/44] process Object boxes and names as OCR with name as text + certainty --- .../MLYoloPostProcessor.php | 80 ++++++++++++++++++- 1 file changed, 77 insertions(+), 3 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php index 28bc4be..70166c8 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php @@ -80,13 +80,32 @@ protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { if ($iiifidentifier == NULL || empty($iiifidentifier)) { return $output; } + //@TODO we know yolov8 takes 640px. We can pass just that to make it faster. + // But requires us to call info.json and pre-process the sizes. $arguments['iiif_image_url'] = $config['iiif_server']."/{$iiifidentifier}/full/full/0/default.jpg"; + //@TODO we are not filtering here by label yet. Next release. $arguments['labels'] = []; - + $page_text = NULL; + $output->plugin = NULL; + $labels = []; $ML = $nlpClient->get_call($config['ml_method'], $arguments, 'en'); - $output->searchapi['vector_576'] = $ML['yolo']['vector'] ?? NULL; + $output->searchapi['vector_576'] = is_array($ML['yolo']['vector']) && count($ML['yolo']['vector'])== 576 ? $ML['yolo']['vector'] : NULL; + if (is_array($ML['yolo']['objects']) && count($ML['yolo']['objects']) > 0 ) { + $miniocr = $this->yolotToMiniOCR($ML['yolo']['objects'], $width, $height, $sequence_number); + $output->searchapi['fulltext'] = $miniocr; + $output->plugin = $miniocr; + $page_text = isset($output->searchapi['fulltext']) ? strip_tags(str_replace("", + PHP_EOL . 
" ", $output->searchapi['fulltext'])) : ''; + // What is a good confidence ratio here? + // based on the % of the bounding box? + // Just the value? + foreach($ML['yolo']['objects'] as $object) { + $labels[$object['name']] = $object['name']; + } + } + $output->searchapi['metadata'] = $labels; $output->searchapi['service_md5'] = isset($ML['yolo']['modelinfo']) ? md5(json_encode($ML['yolo']['modelinfo'])) : NULL; - $output->searchapi['plaintext'] = ''; + $output->searchapi['plaintext'] = $page_text ?? ''; $output->searchapi['processlang'] = $file_languages; $output->searchapi['ts'] = date("c"); $output->searchapi['label'] = $this->t("ML Image Embeddings & Vectors") . ' ' . $sequence_number; @@ -94,4 +113,59 @@ protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { } + protected function yolotToMiniOCR(array $objects, $width, $height, $pageid) { + $miniocr = new \XMLWriter(); + $miniocr->openMemory(); + $miniocr->startDocument('1.0', 'UTF-8'); + $miniocr->startElement("ocr"); + $atleastone_word = FALSE; + // To avoid divisions by 0 + $pwidth = (float) $width; + $pheight = (float) $height; + // NOTE: floats are in the form of .1 so we need to remove the first 0. + $miniocr->startElement("p"); + $miniocr->writeAttribute("xml:id", 'ml_yolo_' . $pageid); + $miniocr->writeAttribute("wh", + ltrim($pwidth ?? '', 0) . " " . ltrim($pheight ?? '', 0)); + $miniocr->startElement("b"); + foreach ($objects as $object) { + $notFirstWord = FALSE; + $miniocr->startElement("l"); + $x0 = (float) $object['box']['x1']; + $y0 = (float) $object['box']['y1']; + $x1 = (float) $object['box']['x2']; + $y1 = (float) $object['box']['y2']; + $l = ltrim(sprintf('%.3f', $x0) ?? '', 0); + $t = ltrim(sprintf('%.3f', $y0) ?? '', 0); + $w = ltrim(sprintf('%.3f', ($x1 - $x0)) ?? '', 0); + $h = ltrim(sprintf('%.3f', ($y1 - $y0)) ?? '', 0); + $text = (string) $object['name']?? 'Unlabeled' .' ~ '. 
$object['confidence']; + if ($notFirstWord) { + $miniocr->text(' '); + } + $notFirstWord = TRUE; + // New OCR Highlight does not like empty tags at all + if (strlen(trim($text ?? '')) > 0) { + $miniocr->startElement("w"); + $miniocr->writeAttribute("x", + $l . ' ' . $t . ' ' . $w . ' ' . $h); + $miniocr->text($text); + // Only assume we have at least one word for tags + // Since lines? could end empty? + $atleastone_word = TRUE; + $miniocr->endElement(); + } + $miniocr->endElement(); + } + $miniocr->endElement(); + $miniocr->endElement(); + $miniocr->endElement(); + $miniocr->endDocument(); + if ($atleastone_word) { + return $miniocr->outputMemory(TRUE); + } + else { + return StrawberryfieldFlavorDatasource::EMPTY_MINIOCR_XML; + } + } } From 01e2fe6e020c6b8c6290ffccd8756b5d6df763ad Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 14 May 2024 16:06:40 -0400 Subject: [PATCH 10/44] Confidence goes with label detected --- .../StrawberryRunnersPostProcessor/MLYoloPostProcessor.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php index 70166c8..4eca135 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php @@ -89,8 +89,8 @@ protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { $output->plugin = NULL; $labels = []; $ML = $nlpClient->get_call($config['ml_method'], $arguments, 'en'); - $output->searchapi['vector_576'] = is_array($ML['yolo']['vector']) && count($ML['yolo']['vector'])== 576 ? $ML['yolo']['vector'] : NULL; - if (is_array($ML['yolo']['objects']) && count($ML['yolo']['objects']) > 0 ) { + $output->searchapi['vector_576'] = isset($ML['yolo']['vector']) && is_array($ML['yolo']['vector']) && count($ML['yolo']['vector'])== 576 ? 
$ML['yolo']['vector'] : NULL; + if (isset($ML['yolo']['objects']) && is_array($ML['yolo']['objects']) && count($ML['yolo']['objects']) > 0 ) { $miniocr = $this->yolotToMiniOCR($ML['yolo']['objects'], $width, $height, $sequence_number); $output->searchapi['fulltext'] = $miniocr; $output->plugin = $miniocr; @@ -139,7 +139,7 @@ protected function yolotToMiniOCR(array $objects, $width, $height, $pageid) { $t = ltrim(sprintf('%.3f', $y0) ?? '', 0); $w = ltrim(sprintf('%.3f', ($x1 - $x0)) ?? '', 0); $h = ltrim(sprintf('%.3f', ($y1 - $y0)) ?? '', 0); - $text = (string) $object['name']?? 'Unlabeled' .' ~ '. $object['confidence']; + $text = (string) ($object['name'] ?? 'Unlabeled') .' ~ '. (string) ($object['confidence'] ?? '0'); if ($notFirstWord) { $miniocr->text(' '); } From 73b9790f22de07c59ab50fad714baf226b7c6683 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Wed, 15 May 2024 20:50:35 -0400 Subject: [PATCH 11/44] Adds an ML Filter. Still Working on it. Does nothing yet But it will. Basically i can't pass an Image URL and a bounding box via GET (damn Drupal 10.2) but! i can gzip and base64 encode. I will that next. The idea here is that this filter can be fed via "ajax" by another view/formatter/plugin with a JSON structure that i will then decode here. I will check, if no bounding box was given, if i have a Vector for the same type configured in Solr and use that one, if not, i will call the corresponding API on the NLP container and the generate a vector on the fly (well like 3 seconds at least) and then alter the query to do a KNN...
cool stuff here --- .../filter/StrawberryRunnersMLImagefilter.php | 413 ++++++++++++++++++ strawberry_runners.module | 46 +- 2 files changed, 453 insertions(+), 6 deletions(-) create mode 100644 src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php diff --git a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php new file mode 100644 index 0000000..1ad3960 --- /dev/null +++ b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php @@ -0,0 +1,413 @@ +setNodeStorage( + $container->get('entity_type.manager')->getStorage('node') + ); + $plugin->setFieldsHelper($container->get('search_api.fields_helper')); + $plugin->setViewStorage( + $container->get('entity_type.manager')->getStorage('view') + ); + $plugin->setCache($container->get('cache.default')); + $plugin->currentUser = $container->get('current_user'); + return $plugin; + } + + + /** + * {@inheritdoc} + */ + public function defineOptions() { + $options = parent::defineOptions(); + $options['value']['default'] = []; + $options['sbf_fields'] = ['default' => []]; + return $options; + } + protected function canBuildGroup() { + return FALSE; + } + + /** + * {@inheritdoc} + */ + public function defaultExposeOptions() { + parent::defaultExposeOptions(); + $this->options['expose']['reduce'] = FALSE; + } + + protected function valueSubmit($form, FormStateInterface $form_state) { + $form_state = $form_state; + } + + /** + * Sets the Node Storage. + * + * @param \Drupal\node\NodeStorageInterface $nodestorage + * The node storage. + * + * @return $this + */ + + public function setNodeStorage(NodeStorageInterface $nodestorage) { + $this->nodeStorage = $nodestorage; + return $this; + } + + public function setFieldsHelper(FieldsHelperInterface $fieldsHelper) { + $this->fieldsHelper = $fieldsHelper; + return $this; + } + + /** + * Sets the View Storage. + * + * @param \Drupal\Core\Entity\EntityStorageInterface $viewstorage + * The view Storage. 
+ * + * @return $this + */ + public function setViewStorage(EntityStorageInterface $viewstorage) { + $this->viewStorage = $viewstorage; + return $this; + } + + /** + * Sets the Cache Backed. + * + * @param \Drupal\Core\Cache\CacheBackendInterface $cache + * The cache backend. Use to store complex calculations of property paths. + * + * @return $this + */ + public function setCache(CacheBackendInterface $cache) { + $this->cache = $cache; + return $this; + } + + public function showOperatorForm(&$form, FormStateInterface $form_state) { + } + + /** + * {@inheritdoc} + */ + public function buildOptionsForm(&$form, FormStateInterface $form_state) { + parent::buildOptionsForm($form, $form_state); + + $fields = $this->getSbfDenseVectorFields() ?? []; + $form['sbf_fields'] = [ + '#type' => 'select', + '#title' => $this->t( + 'KNN Fields query against' + ), + '#description' => $this->t( + 'Select the fields that will be used to query against.' + ), + '#options' => $fields, + '#multiple' => FALSE, + '#default_value' => $this->options['sbf_fields'], + '#required' => TRUE, + ]; + $form['pre_query'] = [ + '#type' => 'checkbox', + '#default_value' => $this->options['pre_query'], + '#title' => $this->t('Treat previous filters to this as prequeries'), + '#description'=> $this->t( + 'If any other filter setup before this one will be treated as pre-queries to the actual KNN query.' + ), + ]; + $form['pre_query_facets'] = [ + '#type' => 'checkbox', + '#default_value' => $this->options['pre_query_facets'], + '#title' => $this->t('Treat also facets, if any, as prequeries'), + '#description'=> $this->t( + 'If any other facets will be treated as pre-queries to the actual KNN query.' + ), + ]; + } + + public function submitOptionsForm(&$form, FormStateInterface $form_state) { + parent::submitOptionsForm( + $form, $form_state + ); + } + + protected function valueForm(&$form, FormStateInterface $form_state) { + $this->value = is_array($this->value) ? 
$this->value : (array) $this->value; + if (!$form_state->get('exposed')) { + $form['value'] = [ + '#type' => 'textarea', + '#title' => t('JSON used to query internal form'), + '#prefix' => '
', + '#suffix' => '
' + ]; + } + elseif ($this->isExposed()) { + $form['value'] = [ + '#type' => 'textarea', + '#title' => t('JSON used to query public form'), + '#prefix' => '
', + '#suffix' => '
' + ] ; + } + } + + protected function valueValidate($form, FormStateInterface $form_state) { + $node_uuids_or_ids = []; + if ($values = $form_state->getValue(['options', 'value'])) { + if (!is_array($values)) { $values = (array) $values;} + foreach ($values as $value) { + $node_uuids_or_ids[] = $value; + } + sort($node_uuids_or_ids); + } + $form_state->setValue(['options', 'value'], $node_uuids_or_ids); + } + + public function hasExtraOptions() { + return FALSE; + } + + /** + * @inheritDoc + */ + protected function operatorForm(&$form, FormStateInterface $form_state) { + parent::operatorForm($form, $form_state); // TODO: Change the autogenerated stub + } + + + /** + * {@inheritdoc} + */ + public function buildExposeForm(&$form, FormStateInterface $form_state) { + parent::buildExposeForm($form, $form_state); + unset($form['expose']['reduce']); + } + + + public function query() { + if (empty($this->value)) { + return; + } + // Select boxes will always generate a single value. + // I could check here or cast sooner on validation? + if (!is_array($this->value)) { + $this->value = (array) $this->value; + } + + $query = $this->getQuery(); + + if (array_filter($this->value, 'is_numeric') === $this->value) { + $nodes = $this->value ? $this->nodeStorage->loadByProperties( + ['nid' => $this->value] + ) : []; + } + else { + $nodes = $this->value ? $this->nodeStorage->loadByProperties( + ['uuid' => $this->value] + ) : []; + } + return; + } + + + public function validate() { + + // For values passed by direct reference we will require/assume + // $json_for_url = base64_encode(gzcompress($json)); + // And this operation will happen on reading/setting back and forth. + $errors = parent::validate(); + if (is_array($this->value)) { + if ($this->options['exposed'] && !$this->options['expose']['required'] + && empty($this->value) + ) { + // Don't validate if the field is exposed and no default value is provided.
+ return $errors; + } + // Choose different kind of output for 0, a single and multiple values. + if (count($this->value) == 0) { + $errors[] = $this->t( + 'No valid values found on filter: @filter.', + ['@filter' => $this->adminLabel(TRUE)] + ); + } + } + return $errors; + } + + public function validateExposed(&$form, FormStateInterface $form_state) { + // Only validate exposed input. + if (empty($this->options['exposed']) + || empty($this->options['expose']['identifier']) + ) { + return; + } + // Exposed input for this filter is meant for power users. + // It will be a JSON with the following structure + /* + * { + * "iiif_image_id": "a IIIF id. We won't allow External Images to be used for searching for now.", + * "bbox": { + * "x": float, + * "y": float, + * "w": float, + * "h": float + * } + * } + * + */ + + $identifier = $this->options['expose']['identifier']; + $input = $form_state->getValue($identifier); + + $values = (array) $input; + if ($values) { + $this->validated_exposed_input = []; + } + } + + + public function acceptExposedInput($input) { + $rc = parent::acceptExposedInput($input); + + if ($rc) { + // If we have previously validated input, override. + if (isset($this->validated_exposed_input)) { + $this->value = $this->validated_exposed_input; + } + } + + return $rc; + } + + /** + * Retrieves a list of all index fields whose type is a dense vector. + * + * @return string[] + * An options list of field identifiers mapped to their prefixed + * labels. + */ + protected function getSbfDenseVectorFields() { + $fields = []; + /** @var \Drupal\search_api\IndexInterface $index */ + $index = Index::load(substr($this->table, 17)); + + $fields_info = $index->getFields(); + foreach ($fields_info as $field_id => $field) { + //if (($field->getDatasourceId() == 'strawberryfield_flavor_datasource') && ($field->getType() == "integer")) { + // Anything except text, fulltext or any solr_text variations.
Also skip direct node id and UUIDs which would + // basically return the same ADO as input filtered, given that those are unique. + $property_path = $field->getPropertyPath(); + $datasource_id = $field->getDatasourceId(); + if (str_starts_with($field->getType(), 'densevector_') === TRUE) { + $field->getDataDefinition(); + $fields[$field_id] = $field->getPrefixedLabel() . '('. $field->getFieldIdentifier() .')'; + } + } + return $fields; + } + + protected function getExistingDenseVectorForImage($uri, $field) { + + } +} diff --git a/strawberry_runners.module b/strawberry_runners.module index ff7bedb..1d77240 100644 --- a/strawberry_runners.module +++ b/strawberry_runners.module @@ -4,9 +4,43 @@ * Contains strawberryfield.module. */ -use Drupal\Core\Form\FormStateInterface; -use Drupal\Core\Url; -use Drupal\webform\WebformSubmissionForm; -use Drupal\Core\Entity\ContentEntityInterface; -use Drupal\file\Entity\File; -use Drupal\node\NodeInterface; +use Drupal\Core\Utility\Error as ErrorAlias; +use Drupal\search_api\Entity\Index; + + + +/** + * Implements hook_views_data_alter(). + */ +function strawberry_runners_views_data_alter(array &$data) { + //@see search_api_views_data() + /** @var \Drupal\search_api\IndexInterface $index */ + foreach (Index::loadMultiple() as $index) { + try { + $key = 'search_api_index_' . 
$index->id(); + $table = &$data[$key]; + + $ml_image_filter = _search_api_views_find_field_alias('sbr_imageml_filter', $table); + $table[$ml_image_filter] = [ + 'title' => t('Image Similarity Filter via KNN (Experimental)'), + 'group' => t('Search'), + 'help' => t('Filters one or more Images belonging to an ADO against the Corresponding Vector in a Strawberry Flavor Document generating on the Fly an Embedding Vector.'), + 'filter' => [ + 'title' => t('Image Similarity Filter via KNN '), + 'field' => 'id', + 'id' => 'sbr_imageml_filter', + ], + ]; + if ($ml_image_filter != 'sbr_imageml_filter') { + $table[$ml_image_filter]['real field'] = 'sbr_imageml_filter'; + } + } + catch (\Exception $e) { + $args = [ + '%index' => $index->label(), + ]; + ErrorAlias::logException('strawberry_runners', $e, '%type while computing Views data for index %index: @message in %function (line %line of %file).', $args); + } + } + return $data; +} From f478f2729f4398f165daaf6e53381bce8e1d0a47 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Fri, 17 May 2024 14:10:01 -0400 Subject: [PATCH 12/44] Clean up Abstract ML processor and add public methods/constants Basically preparing for re-use inside a Views Filter. 
That way we can ensure we search with the same ML model/vector size that generated the "against what we are searching for" --- .../abstractMLPostProcessor.php | 75 ++++++++++--------- 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php index b43406f..463cd5a 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php @@ -39,6 +39,17 @@ public function defaultConfiguration() { ] + parent::defaultConfiguration(); } + public const ML_IMAGE_VECTOR_SIZE = [ + '/image/yolo' => 576, + '/image/mobilenet' => 1024, + '/image/insightfacet' => 512, + ]; + + public const ML_TEXT_VECTOR_SIZE = [ + '/text/bert' => 384, + ]; + + protected $nlp_client = null; public function calculateDependencies() { // Since Processors could be chained we need to check if any other @@ -239,7 +250,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $output = new \stdClass(); if (!empty($config['nlp_url']) && !empty($config['ml_method'])) { - $nlp = new NlpClient($config['nlp_url']); + $nlp = $this->getNLPClient(); if ($nlp) { $capabilities = $nlp->get_call('/status', NULL); $languages_enabled = []; @@ -276,40 +287,9 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug } } - protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { - // This is an example. Each implementing class needs to deal with actual processing of output of the endpoint - $output = new \stdClass(); - $config = $this->getConfiguration(); - $input_argument = $this->pluginDefinition['input_argument']; - $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? 
'') : 'eng']; - // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} - $sequence_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; - setlocale(LC_CTYPE, 'en_US.UTF-8'); - $width = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['width'] ?? NULL; - $height = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['height'] ?? NULL; - if (!($width && $height)) { - $width = $io->input->metadata['flv:exif']['ImageWidth'] ?? NULL; - $height = $io->input->metadata['flv:exif']['ImageHeight'] ?? NULL; - } + abstract protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass; - $ML = $nlpClient->get_call($config['ml_method'], [], 'en'); - $output->searchapi['plaintext'] = ''; - $output->searchapi['processlang'] = $file_languages; - $output->searchapi['ts'] = date("c"); - $output->searchapi['label'] = $this->t("ML Image Embeddings & Vectors") . ' ' . $sequence_number; - return $output; - } - - protected function runTextMLfromMetadata($io, NlpClient $nlpClient) { - $output = new \stdClass(); - $config = $this->getConfiguration(); - $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? '') : 'eng']; - $output->searchapi['plaintext'] = ''; - $output->searchapi['processlang'] = $file_languages; - $output->searchapi['ts'] = date("c"); - $output->searchapi['label'] = $this->t("ML Text Embedding"); - return $output; - } + abstract protected function runTextMLfromMetadata($io, NlpClient $nlpClient) :\stdClass; // Mime types supported as input to Tesseract. 
// See https://github.com/tesseract-ocr/tessdoc/blob/main/InputFormats.md @@ -324,4 +304,31 @@ public function isImageMLMimeType($mime_type): bool { return in_array($mime_type, $image_ML_mime_types); } + public function getVectorMLInfo() { + $config = $this->getConfiguration(); + $info = [ + 'nlp_url' => $config['nlp_url'], + 'ml_method' => $config['ml_method'], + 'iiif_server' => $config['iiif_server'], + ]; + } + + public function callNlPwithArguments() { + return []; + } + + protected function getNLPClient() { + if ($this->nlp_client) { + return $this->nlp_client; + } + else { + $config = $this->getConfiguration(); + $nlp = new NlpClient($config['nlp_url']); + $this->nlp_client = $nlp; + return $this->nlp_client; + } + } + + + } From 3645846b31c7c0d095810ef88672d159cec8b393 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Fri, 17 May 2024 14:11:00 -0400 Subject: [PATCH 13/44] Clean up YOLO processor (more to come) and expand the interface public methods of the runnerUtility Service (just bc it aids in autocompleting/validating/overriding if needed/ever in the future) --- .../MLYoloPostProcessor.php | 24 +++++++++++++++---- ...trawberryRunnerUtilityServiceInterface.php | 9 +++++++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php index 4eca135..ea35ec2 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php @@ -19,7 +19,6 @@ use Drupal\strawberry_runners\Web64\Nlp\NlpClient; use Laracasts\Transcriptions\Transcription; - /** * * ML YOLO @@ -59,6 +58,12 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { return $element; } + protected function runTextMLfromMetadata($io, NlpClient $nlpClient): \stdClass { + $output = new \stdClass(); + return $output; + // TODO: Implement 
runTextMLfromMetadata() method. + } + protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { $output = new \stdClass(); $config = $this->getConfiguration(); @@ -82,13 +87,13 @@ protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { } //@TODO we know yolov8 takes 640px. We can pass just that to make it faster. // But requires us to call info.json and pre-process the sizes. - $arguments['iiif_image_url'] = $config['iiif_server']."/{$iiifidentifier}/full/full/0/default.jpg"; + $iiif_image_url = $config['iiif_server']."/{$iiifidentifier}/full/full/0/default.jpg"; //@TODO we are not filtering here by label yet. Next release. - $arguments['labels'] = []; + $labels = []; $page_text = NULL; $output->plugin = NULL; $labels = []; - $ML = $nlpClient->get_call($config['ml_method'], $arguments, 'en'); + $ML = callImageML($iiif_image_url,$labels); $output->searchapi['vector_576'] = isset($ML['yolo']['vector']) && is_array($ML['yolo']['vector']) && count($ML['yolo']['vector'])== 576 ? $ML['yolo']['vector'] : NULL; if (isset($ML['yolo']['objects']) && is_array($ML['yolo']['objects']) && count($ML['yolo']['objects']) > 0 ) { $miniocr = $this->yolotToMiniOCR($ML['yolo']['objects'], $width, $height, $sequence_number); @@ -168,4 +173,15 @@ protected function yolotToMiniOCR(array $objects, $width, $height, $pageid) { return StrawberryfieldFlavorDatasource::EMPTY_MINIOCR_XML; } } + + public function callImageML($image_url, $labels) { + $nlpClient = $this->getNLPClient(); + $config = $this->getConfiguration(); + $arguments['iiif_image_url'] = $image_url; + //@TODO we are not filtering here by label yet. Next release. 
+ $arguments['labels'] = $labels; + $ML = $nlpClient->get_call($config['ml_method'], $arguments, 1); + return $ML; + } + } diff --git a/src/strawberryRunnerUtilityServiceInterface.php b/src/strawberryRunnerUtilityServiceInterface.php index 8981c43..50911c0 100644 --- a/src/strawberryRunnerUtilityServiceInterface.php +++ b/src/strawberryRunnerUtilityServiceInterface.php @@ -25,4 +25,13 @@ public function invokeProcessorForAdo(ContentEntityInterface $entity, array $sbf_fields, bool $force = FALSE, array $filter = [] ): void; + /** + * Gets all Currently Active PLugin Entities and Configs initialized + * + * + * @return array + * @throws \Drupal\Component\Plugin\Exception\PluginException + */ + public function getActivePluginConfigs():array; + } From 23da661dca03c9a9fd55543547d7ad3af86b123a Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Fri, 17 May 2024 14:11:15 -0400 Subject: [PATCH 14/44] Not close to ready, but better commit now than be sorry later --- .../filter/StrawberryRunnersMLImagefilter.php | 73 ++++++++++++++++++- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php index 1ad3960..1d157fb 100644 --- a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php +++ b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php @@ -42,8 +42,6 @@ class StrawberryRunnersMLImagefilter extends FilterPluginBase /* FilterPluginBas use SearchApiFilterTrait; - - protected $alwaysMultiple = TRUE; public $no_operator = TRUE; @@ -90,6 +88,13 @@ class StrawberryRunnersMLImagefilter extends FilterPluginBase /* FilterPluginBas */ protected $cache; + /** + * The Strawberry Runners Utility Service. 
+ * + * @var \Drupal\strawberry_runners\strawberryRunnerUtilityServiceInterface + */ + private $strawberryRunnerUtilityService; + /** * {@inheritdoc} @@ -111,6 +116,9 @@ public static function create(ContainerInterface $container, ); $plugin->setCache($container->get('cache.default')); $plugin->currentUser = $container->get('current_user'); + $plugin->strawberryRunnerUtilityService = $container->get( + 'strawberry_runner.utility' + ); return $plugin; } @@ -194,6 +202,10 @@ public function showOperatorForm(&$form, FormStateInterface $form_state) { public function buildOptionsForm(&$form, FormStateInterface $form_state) { parent::buildOptionsForm($form, $form_state); + $active_plugins = $this->strawberryRunnerUtilityService->getActivePluginConfigs(); + + + $fields = $this->getSbfDenseVectorFields() ?? []; $form['sbf_fields'] = [ '#type' => 'select', @@ -224,6 +236,10 @@ public function buildOptionsForm(&$form, FormStateInterface $form_state) { 'If any other facets will be treated as pre-queries to the actual KNN query.' ), ]; + $form['ml_strawberry_postprocessor'] = [ + + ]; + } public function submitOptionsForm(&$form, FormStateInterface $form_state) { @@ -233,6 +249,8 @@ public function submitOptionsForm(&$form, FormStateInterface $form_state) { } protected function valueForm(&$form, FormStateInterface $form_state) { + // At this stage $this->value is not set? + $this->value = is_array($this->value) ? $this->value : (array) $this->value; if (!$form_state->get('exposed')) { $form['value'] = [ @@ -337,6 +355,8 @@ public function validate() { public function validateExposed(&$form, FormStateInterface $form_state) { // Only validate exposed input. + // In theory this is where i can alter the actual form state input + // to set a different URL argument? compress? 
if (empty($this->options['exposed']) || empty($this->options['expose']['identifier']) ) { @@ -362,14 +382,45 @@ public function validateExposed(&$form, FormStateInterface $form_state) { $values = (array) $input; if ($values) { - $this->validated_exposed_input = []; + if ($this->isExposed()) { + // If already JSON + $json_input = json_decode($values[0] ?? ''); + if ($json_input !== JSON_ERROR_NONE) { + // Probably not the place to compress the data for the URL? + $encoded = base64_encode(gzcompress($values[0])); + $form_state->setValue($identifier, $encoded); + $input = $form_state->getUserInput(); + $input[$identifier] = $encoded; + $form_state->setUserInput($input); + $this->validated_exposed_input = $json_input; + $filter_input = $this->view->getExposedInput(); + $filter_input[$identifier] = $encoded; + $this->view->setExposedInput($filter_input); + } + else { + // check if base64 encoded then + if ($this->is_base64()) { + + $decoded = gzdecode(base64_decode($values[0])); + if ($decoded !== FALSE) { + $json_input = json_decode($values[0] ?? ''); + + } + } + } + } + else { + + } } } public function acceptExposedInput($input) { + // Called during the form submit itself.. $rc = parent::acceptExposedInput($input); - + // a False means it won't be included/alter the generated query. + // This is useful! if ($rc) { // If we have previously validated input, override. 
if (isset($this->validated_exposed_input)) { @@ -410,4 +461,18 @@ protected function getSbfDenseVectorFields() { protected function getExistingDenseVectorForImage($uri, $field) { } + + protected function is_base64($s){ + // Check if there are valid base64 characters + if (!preg_match('/^[a-zA-Z0-9\/\r\n+]*={0,2}$/', $s)) return false; + + // Decode the string in strict mode and check the results + $decoded = base64_decode($s, true); + if(false === $decoded) return false; + + // Encode the string again + if(base64_encode($decoded) != $s) return false; + + return true; + } } From d1351f9e776064f1ca2ca0f94652170476a5f110 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Fri, 17 May 2024 15:33:08 -0400 Subject: [PATCH 15/44] Allow the runners service to provide (not default) also a list of plugins that are not "top level"... because we might end needing deeper level listings for external access of the method --- src/strawberryRunnerUtilityService.php | 12 +++++++----- src/strawberryRunnerUtilityServiceInterface.php | 5 +++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/strawberryRunnerUtilityService.php b/src/strawberryRunnerUtilityService.php index 57ed391..7cebdec 100644 --- a/src/strawberryRunnerUtilityService.php +++ b/src/strawberryRunnerUtilityService.php @@ -2,6 +2,7 @@ namespace Drupal\strawberry_runners; +use Drupal\Component\Plugin\Exception\PluginException; use Drupal\Core\Config\ConfigFactoryInterface; use Drupal\Core\Queue\QueueFactory; use Drupal\Core\Entity\ContentEntityInterface; @@ -394,13 +395,14 @@ public function invokeProcessorForAdo(ContentEntityInterface $entity, array $sbf } /** - * Gets all Currently Active PLugin Entities and Configs initialized - * + * Gets all Currently Active Plugin Entities and Configs initialized * + * @param bool $onlyRoot + * TRUE means we only get Top/first call Processors. FALSE, any processor at any level. 
* @return array - * @throws \Drupal\Component\Plugin\Exception\PluginException + * @throws PluginException */ - public function getActivePluginConfigs():array { + public function getActivePluginConfigs($onlyRoot = TRUE):array { $active_plugins = []; /* @var $plugin_config_entities \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity[] */ $plugin_config_entities = $this->entityTypeManager->getListBuilder( @@ -410,7 +412,7 @@ public function getActivePluginConfigs():array { foreach ($plugin_config_entities as $plugin_config_entity) { // Only get first level (no Parents) and Active ones. if ($plugin_config_entity->isActive() - && $plugin_config_entity->getParent() == '' + && (($onlyRoot && $plugin_config_entity->getParent() == '') || (!$onlyRoot)) ) { $entity_id = $plugin_config_entity->id(); $configuration_options = $plugin_config_entity->getPluginconfig(); diff --git a/src/strawberryRunnerUtilityServiceInterface.php b/src/strawberryRunnerUtilityServiceInterface.php index 50911c0..ca90e20 100644 --- a/src/strawberryRunnerUtilityServiceInterface.php +++ b/src/strawberryRunnerUtilityServiceInterface.php @@ -28,10 +28,11 @@ public function invokeProcessorForAdo(ContentEntityInterface $entity, /** * Gets all Currently Active PLugin Entities and Configs initialized * - * + * @param bool $onlyRoot + * TRUE means we only get Top/first call Processors. FALSE, any processor at any level. * @return array * @throws \Drupal\Component\Plugin\Exception\PluginException */ - public function getActivePluginConfigs():array; + public function getActivePluginConfigs($onlyRoot = TRUE):array; } From 21a1b51fda2e265b8300ec003f49916c94f7f3b0 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Fri, 17 May 2024 15:34:27 -0400 Subject: [PATCH 16/44] Overly complex validation of configured field v/s Processor Because vectors are so specific. And we don't want white screens of sorts like e.g "solr died" because it felt lonely, tired who knows. 
--- .../filter/StrawberryRunnersMLImagefilter.php | 182 ++++++++++++------ 1 file changed, 128 insertions(+), 54 deletions(-) diff --git a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php index 1d157fb..c10559b 100644 --- a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php +++ b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php @@ -4,6 +4,7 @@ use Drupal\Core\Cache\CacheableMetadata; use Drupal\Core\Cache\RefinableCacheableDependencyInterface; +use Drupal\Core\Entity\EntityTypeManagerInterface; use Drupal\Core\Entity\TypedData\EntityDataDefinitionInterface; use Drupal\Core\Field\TypedData\FieldItemDataDefinitionInterface; use Drupal\Core\Form\FormStateInterface; @@ -54,11 +55,11 @@ class StrawberryRunnersMLImagefilter extends FilterPluginBase /* FilterPluginBas public $validated_exposed_input = NULL; /** - * The vocabulary storage. + * The Entity Type manager * - * @var \Drupal\node\NodeStorageInterface + * @var \Drupal\Core\Entity\EntityStorageInterface */ - protected $nodeStorage; + protected $sbrEntityStorage; /** * The vocabulary storage. @@ -95,30 +96,42 @@ class StrawberryRunnersMLImagefilter extends FilterPluginBase /* FilterPluginBas */ private $strawberryRunnerUtilityService; + /** + * The StrawberryRunner Processor Plugin Manager. 
+ * + * @var \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginManager + */ + private $strawberryRunnerProcessorPluginManager; /** * {@inheritdoc} */ public static function create(ContainerInterface $container, - array $configuration, $plugin_id, $plugin_definition + array $configuration, $plugin_id, $plugin_definition ) { /** @var static $plugin */ $plugin = parent::create( $container, $configuration, $plugin_id, $plugin_definition ); - $plugin->setNodeStorage( - $container->get('entity_type.manager')->getStorage('node') + $plugin->setSbrEntityStorage( + $container->get('entity_type.manager')->getStorage('strawberry_runners_postprocessor') ); $plugin->setFieldsHelper($container->get('search_api.fields_helper')); $plugin->setViewStorage( $container->get('entity_type.manager')->getStorage('view') ); + $plugin->setViewStorage( + $container->get('entity_type.manager')->getStorage('view') + ); $plugin->setCache($container->get('cache.default')); $plugin->currentUser = $container->get('current_user'); $plugin->strawberryRunnerUtilityService = $container->get( 'strawberry_runner.utility' ); + $plugin->strawberryRunnerProcessorPluginManager = $container->get( + 'strawberry_runner.processor_manager' + ); return $plugin; } @@ -132,6 +145,13 @@ public function defineOptions() { $options['sbf_fields'] = ['default' => []]; return $options; } + + public function setSbrEntityStorage(EntityStorageInterface $sbrEntityStorage): StrawberryRunnersMLImagefilter + { + $this->sbrEntityStorage = $sbrEntityStorage; + return $this; + } + protected function canBuildGroup() { return FALSE; } @@ -148,19 +168,6 @@ protected function valueSubmit($form, FormStateInterface $form_state) { $form_state = $form_state; } - /** - * Sets the Node Storage. - * - * @param \Drupal\node\NodeStorageInterface $nodestorage - * The node storage. 
- * - * @return $this - */ - - public function setNodeStorage(NodeStorageInterface $nodestorage) { - $this->nodeStorage = $nodestorage; - return $this; - } public function setFieldsHelper(FieldsHelperInterface $fieldsHelper) { $this->fieldsHelper = $fieldsHelper; @@ -201,8 +208,15 @@ public function showOperatorForm(&$form, FormStateInterface $form_state) { */ public function buildOptionsForm(&$form, FormStateInterface $form_state) { parent::buildOptionsForm($form, $form_state); + $active_plugins = $this->strawberryRunnerUtilityService->getActivePluginConfigs(FALSE); - $active_plugins = $this->strawberryRunnerUtilityService->getActivePluginConfigs(); + foreach ($active_plugins as $by_source => $plugins) { + foreach ($plugins as $entity_id => $active_plugin) { + if (isset($active_plugin['ml_method'])) { + $post_processor_options[$entity_id] = $active_plugin['ml_method'] ."({$entity_id})"; + } + } + } @@ -236,10 +250,66 @@ public function buildOptionsForm(&$form, FormStateInterface $form_state) { 'If any other facets will be treated as pre-queries to the actual KNN query.' ), ]; - $form['ml_strawberry_postprocessor'] = [ - + $form['ml_strawberry_postprocessor'] = [ + '#type' => 'select', + '#title' => $this->t( + 'Strawberry Runners processor to extract the on-the fly embedding' + ), + '#description' => $this->t( + 'Select the ML Strawberry Runners Processor that was used to index Vectors into the field you are going to search against. These need to match' + ), + '#options' => $post_processor_options, + '#multiple' => FALSE, + '#default_value' => $this->options['ml_strawberry_postprocessor'], + '#required' => TRUE, ]; + } + /** + * Validate the options form. + */ + public function validateOptionsForm(&$form, FormStateInterface $form_state) { + // We need to validate that the selected field is of the same source/size as model that will + // be used to generate the on the fly vectors. 
+ // So we need to load the SBR entity passed, compare the model against the constant present in + // \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessor\abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE + // and then load the field and see if the source (is of the same SBFlavor property/size (vector_576, etc) + $valid = FALSE; + $options = $form_state->getValue('options'); + $processor_id = $options['ml_strawberry_postprocessor'] ?? NULL; + if ($processor_id == NULL) { + // Can't validate yet here.Probably being setup by the user still. + return; + } + $field_id = $options['sbf_fields']; + if ($processor_id) { + /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity|null */ + $plugin_config_entity = $this->sbrEntityStorage->load($processor_id); + if ($plugin_config_entity->isActive()) { + $config = $plugin_config_entity->getPluginconfig(); + // Note, we could also restrict to the same image mimetypes that the processor is setup to handle? + if (isset($config['ml_method'])) { + $vector_size = \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessor\abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$config['ml_method']] ?? ''; + $field_info = $this->getSbfDenseVectorFieldSource($field_id); + if ($field_info) { + // We do allow mixed data sources. One can be a node of course even if the source is a flavor. This is because each source could inherit properties from the other. + $propath_pieces = explode('/', $field_info->getCombinedPropertyPath()); + if (end($propath_pieces) == 'vector_' .$vector_size && $field_info->getType() == 'densevector_' . $vector_size) { + $valid = TRUE; + } + else { + $form_state->setErrorByName('ml_strawberry_postprocessor', $this->t('The Field/Processor Combination is not right. Make sure your Vector Field and processor are targeting the same Vector Dimensions')); + } + } + else { + // The field is gone. 
+ $form_state->setErrorByName('sbf_fields', $this->t('Configured Dense Vector field does not longer exists.')); + } + } + } + } + if ($valid) { + } } public function submitOptionsForm(&$form, FormStateInterface $form_state) { @@ -252,22 +322,22 @@ protected function valueForm(&$form, FormStateInterface $form_state) { // At this stage $this->value is not set? $this->value = is_array($this->value) ? $this->value : (array) $this->value; - if (!$form_state->get('exposed')) { - $form['value'] = [ - '#type' => 'textarea', - '#title' => t('JSON used to query internal form'), - '#prefix' => '
', - '#suffix' => '
' - ]; - } - elseif ($this->isExposed()) { - $form['value'] = [ - '#type' => 'textarea', - '#title' => t('JSON used to query public form'), - '#prefix' => '
', - '#suffix' => '
' - ] ; - } + if (!$form_state->get('exposed')) { + $form['value'] = [ + '#type' => 'textarea', + '#title' => t('JSON used to query internal form'), + '#prefix' => '
', + '#suffix' => '
' + ]; + } + elseif ($this->isExposed()) { + $form['value'] = [ + '#type' => 'textarea', + '#title' => t('JSON used to query public form'), + '#prefix' => '
', + '#suffix' => '
' + ] ; + } } protected function valueValidate($form, FormStateInterface $form_state) { @@ -316,14 +386,10 @@ public function query() { $query = $this->getQuery(); if (array_filter($this->value, 'is_numeric') === $this->value) { - $nodes = $this->value ? $this->nodeStorage->loadByProperties( - ['nid' => $this->value] - ) : []; + } else { - $nodes = $this->value ? $this->nodeStorage->loadByProperties( - ['uuid' => $this->value] - ) : []; + } return; } @@ -387,15 +453,15 @@ public function validateExposed(&$form, FormStateInterface $form_state) { $json_input = json_decode($values[0] ?? ''); if ($json_input !== JSON_ERROR_NONE) { // Probably not the place to compress the data for the URL? - $encoded = base64_encode(gzcompress($values[0])); - $form_state->setValue($identifier, $encoded); - $input = $form_state->getUserInput(); - $input[$identifier] = $encoded; - $form_state->setUserInput($input); - $this->validated_exposed_input = $json_input; - $filter_input = $this->view->getExposedInput(); - $filter_input[$identifier] = $encoded; - $this->view->setExposedInput($filter_input); + $encoded = base64_encode(gzcompress($values[0])); + $form_state->setValue($identifier, $encoded); + $input = $form_state->getUserInput(); + $input[$identifier] = $encoded; + $form_state->setUserInput($input); + $this->validated_exposed_input = $json_input; + $filter_input = $this->view->getExposedInput(); + $filter_input[$identifier] = $encoded; + $this->view->setExposedInput($filter_input); } else { // check if base64 encoded then @@ -453,11 +519,19 @@ protected function getSbfDenseVectorFields() { if (str_starts_with($field->getType(), 'densevector_') === TRUE) { $field->getDataDefinition(); $fields[$field_id] = $field->getPrefixedLabel() . '('. 
$field->getFieldIdentifier() .')'; - } + } } return $fields; } + protected function getSbfDenseVectorFieldSource($field_id) { + $fields = []; + /** @var \Drupal\search_api\IndexInterface $index */ + $index = Index::load(substr($this->table, 17)); + $fields_info = $index->getField($field_id); + return $fields_info; + } + protected function getExistingDenseVectorForImage($uri, $field) { } From 73e09290e6b952eaf5c0a23ac82387435464978f Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Fri, 17 May 2024 15:41:43 -0400 Subject: [PATCH 17/44] Make documenting easier by giving errors that explain why + match the description users would read, I think nobody is into vectors so better so --- .../filter/StrawberryRunnersMLImagefilter.php | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php index c10559b..49b3e25 100644 --- a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php +++ b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php @@ -224,7 +224,7 @@ public function buildOptionsForm(&$form, FormStateInterface $form_state) { $form['sbf_fields'] = [ '#type' => 'select', '#title' => $this->t( - 'KNN Fields query against' + 'KNN Dense Vector Field to query against' ), '#description' => $this->t( 'Select the fields that will be used to query against.' @@ -276,10 +276,6 @@ public function validateOptionsForm(&$form, FormStateInterface $form_state) { $valid = FALSE; $options = $form_state->getValue('options'); $processor_id = $options['ml_strawberry_postprocessor'] ?? NULL; - if ($processor_id == NULL) { - // Can't validate yet here.Probably being setup by the user still. 
- return; - } $field_id = $options['sbf_fields']; if ($processor_id) { /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity|null */ @@ -293,23 +289,17 @@ public function validateOptionsForm(&$form, FormStateInterface $form_state) { if ($field_info) { // We do allow mixed data sources. One can be a node of course even if the source is a flavor. This is because each source could inherit properties from the other. $propath_pieces = explode('/', $field_info->getCombinedPropertyPath()); - if (end($propath_pieces) == 'vector_' .$vector_size && $field_info->getType() == 'densevector_' . $vector_size) { - $valid = TRUE; - } - else { - $form_state->setErrorByName('ml_strawberry_postprocessor', $this->t('The Field/Processor Combination is not right. Make sure your Vector Field and processor are targeting the same Vector Dimensions')); + if (!(end($propath_pieces) == 'vector_' .$vector_size && $field_info->getType() == 'densevector_' . $vector_size)) { + $form_state->setErrorByName('options][ml_strawberry_postprocessor', $this->t('The Field/Processor combination is not right. Make sure your Configured KNN Dense Vector Field and the Strawberry Processor are targeting the same Vector Dimensions (e.g first one is from a vector_576 data source property and the field type is densevector_576 and the processor is calling YOLO)')); } } else { // The field is gone. - $form_state->setErrorByName('sbf_fields', $this->t('Configured Dense Vector field does not longer exists.')); + $form_state->setErrorByName('options][sbf_fields', $this->t('CConfigured KNN Dense Vector Field does not longer exists. 
Please replace your config with a valid/indexed field.')); } } } } - if ($valid) { - - } } public function submitOptionsForm(&$form, FormStateInterface $form_state) { From d4f7ac25d570fae8db6c9c9050846568064263b1 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Fri, 17 May 2024 16:01:50 -0400 Subject: [PATCH 18/44] adds topk (topK solr) as an argument --- .../filter/StrawberryRunnersMLImagefilter.php | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php index 49b3e25..cc1d663 100644 --- a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php +++ b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php @@ -142,7 +142,11 @@ public static function create(ContainerInterface $container, public function defineOptions() { $options = parent::defineOptions(); $options['value']['default'] = []; - $options['sbf_fields'] = ['default' => []]; + $options['sbf_fields'] = ['default' => NULL]; + $options['pre_query'] = ['default' => TRUE]; + $options['pre_query_facets'] = ['default' => TRUE]; + $options['topk'] = ['default' => 3]; + $options['ml_strawberry_postprocessor'] = ['default' => NULL]; return $options; } @@ -168,6 +172,10 @@ protected function valueSubmit($form, FormStateInterface $form_state) { $form_state = $form_state; } + protected function valueValidate($form, FormStateInterface $form_state) { + $form_state->setValue(['options', 'value'], []); + } + public function setFieldsHelper(FieldsHelperInterface $fieldsHelper) { $this->fieldsHelper = $fieldsHelper; @@ -250,6 +258,16 @@ public function buildOptionsForm(&$form, FormStateInterface $form_state) { 'If any other facets will be treated as pre-queries to the actual KNN query.' 
), ]; + $form['topk'] = [ + '#type' => 'number', + '#default_value' => $this->options['topk'], + '#title' => $this->t('Top Similarity KNN hits to request to the backend.'), + '#description'=> $this->t( + 'The more, the slower' + ), + '#min' => 1, + '#max' => 100, + ]; $form['ml_strawberry_postprocessor'] = [ '#type' => 'select', '#title' => $this->t( @@ -330,18 +348,6 @@ protected function valueForm(&$form, FormStateInterface $form_state) { } } - protected function valueValidate($form, FormStateInterface $form_state) { - $node_uuids = []; - if ($values = $form_state->getValue(['options', 'value'])) { - if (!is_array($values)) { (array) $values;} - foreach ($values as $value) { - $node_uuids_or_ids[] = $value; - } - sort($node_uuids_or_ids); - } - $form_state->setValue(['options', 'value'], $node_uuids_or_ids); - } - public function hasExtraOptions() { return FALSE; } From fe578cd4020790512a2a83b9ba8ee90467f1d45e Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Fri, 17 May 2024 20:17:58 -0400 Subject: [PATCH 19/44] Little bit of JSON schema magic for ML filter sanitizing Simpler than multiple if/then/what/else/who-cares situation --- .../filter/StrawberryRunnersMLImagefilter.php | 119 ++++++++++++------ 1 file changed, 82 insertions(+), 37 deletions(-) diff --git a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php index cc1d663..58c2a80 100644 --- a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php +++ b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php @@ -20,6 +20,7 @@ use Drupal\search_api\Plugin\views\query\SearchApiQuery; use Drupal\search_api\SearchApiException; use Drupal\search_api_solr\Utility\Utility; +use Drupal\strawberryfield\Tools\StrawberryfieldJsonHelper; use Drupal\views\Plugin\views\filter\FilterPluginBase; use Drupal\search_api\Plugin\views\filter\SearchApiFilterTrait; use Drupal\views\Plugin\views\filter\InOperator; @@ -41,6 +42,61 @@ class 
StrawberryRunnersMLImagefilter extends FilterPluginBase /* FilterPluginBase */ { + const IMAGEML_INPUT_SCHEMA = <<<'JSON' +{ + "title": "Image ML filter Input structure", + "description": "A JSON Schema describing what this filter accepts.", + "type": "object", + "properties": { + "iiif_image_id": { + "type": "string" + }, + "image_uuid": { + "type": "string" + }, + "bbox": { + "type": "object", + "properties": { + "x": { + "type": "number" + }, + "y": { + "type": "number" + }, + "w": { + "type": "number" + }, + "h": { + "type": "number" + } + }, + "required": [ + "x", + "y", + "w", + "h" + ] + } + }, + "oneOf": [ + { + "required": [ + "iiif_image_id" + ] + }, + { + "required": [ + "image_uuid" + ] + } + ], + "required": [ + "bbox" + ] +} +JSON; + + use SearchApiFilterTrait; protected $alwaysMultiple = TRUE; @@ -226,8 +282,6 @@ public function buildOptionsForm(&$form, FormStateInterface $form_state) { } } - - $fields = $this->getSbfDenseVectorFields() ?? []; $form['sbf_fields'] = [ '#type' => 'select', @@ -328,7 +382,6 @@ public function submitOptionsForm(&$form, FormStateInterface $form_state) { protected function valueForm(&$form, FormStateInterface $form_state) { // At this stage $this->value is not set? - $this->value = is_array($this->value) ? $this->value : (array) $this->value; if (!$form_state->get('exposed')) { $form['value'] = [ @@ -373,6 +426,16 @@ public function query() { if (empty($this->value)) { return; } + /* + * $this->value = {stdClass} + iiif_image_id = "3b9%2Fimage-dcpl-p034-npsncr-00015-rexported-f2c69aeb-7bcb-434a-a781-e580cb3695b7.tiff" + bbox = {stdClass} + x = {float} 0.0 + y = {float} 0.0 + w = {float} 1.0 + h = {float} 1.0 + */ + // Select boxes will always generate a single value. // I could check here or cast sooner on validation? if (!is_array($this->value)) { @@ -424,57 +487,39 @@ public function validateExposed(&$form, FormStateInterface $form_state) { ) { return; } - // Exposed input for this filter is meant for power users. 
- // It will be a JSON with the following structure - /* - * { - * "iiif_image_id": "a IIIF id. We won't allow External Images to be used for searching for now.", - * "bbox": { - * "x": float, - * "y": float, - * "w": float, - * "w": float - * } - * } - * - */ + $this->validated_exposed_input = NULL; $identifier = $this->options['expose']['identifier']; $input = $form_state->getValue($identifier); - $values = (array) $input; if ($values) { if ($this->isExposed()) { // If already JSON - $json_input = json_decode($values[0] ?? ''); - if ($json_input !== JSON_ERROR_NONE) { + $json_input = StrawberryfieldJsonHelper::isValidJsonSchema($values[0], static::IMAGEML_INPUT_SCHEMA); + if ($json_input) { // Probably not the place to compress the data for the URL? $encoded = base64_encode(gzcompress($values[0])); - $form_state->setValue($identifier, $encoded); - $input = $form_state->getUserInput(); - $input[$identifier] = $encoded; - $form_state->setUserInput($input); + $this->validated_exposed_input = $json_input; - $filter_input = $this->view->getExposedInput(); - $filter_input[$identifier] = $encoded; - $this->view->setExposedInput($filter_input); } - else { - // check if base64 encoded then - if ($this->is_base64()) { - - $decoded = gzdecode(base64_decode($values[0])); - if ($decoded !== FALSE) { - $json_input = json_decode($values[0] ?? ''); - + elseif ($this->is_base64($values[0])) { + $decoded = gzdecode(base64_decode($values[0])); + if ($decoded !== FALSE) { + $json_input = StrawberryfieldJsonHelper::isValidJsonSchema($values[0], static::IMAGEML_INPUT_SCHEMA); + if ($json_input === JSON_ERROR_NONE) { + $this->validated_exposed_input = $json_input; } } } } - else { - + if (!$this->validated_exposed_input) { + // Check if the JSON is the right structure. + $form_state->setErrorByName($identifier, $this->t("Wrong format for the ML Image filter input")); } } + else { + // Do for non exposed. 
Should be directly a JSON + } } From f87f5491162073698c118bc7b6787508c1f21f45 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sat, 18 May 2024 12:29:30 -0400 Subject: [PATCH 20/44] The live backend ML vectorizing on query time is working... time to !KNN So, now that we are actually - Fetching an image - Allowing a region - calling the processor on the backend to generate vector (and object detection... will be useful in the future too) We need to start thinking of query/alter via tags. Similar to what we do with the OCR one. But there are many things here we will have to figure out... like how does KNN interact with other ones? The prequery v/s the post query and how we will deal with facets. But baby steps first. We need to add a new query option, alter/act on it and see if !KNN works first --- .../MLYoloPostProcessor.php | 2 +- .../abstractMLPostProcessor.php | 5 +- .../filter/StrawberryRunnersMLImagefilter.php | 89 +++++++++++++++++-- 3 files changed, 84 insertions(+), 12 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php index ea35ec2..cbf47a8 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php @@ -174,7 +174,7 @@ protected function yolotToMiniOCR(array $objects, $width, $height, $pageid) { } } - public function callImageML($image_url, $labels) { + public function callImageML($image_url, $labels):mixed { $nlpClient = $this->getNLPClient(); $config = $this->getConfiguration(); $arguments['iiif_image_url'] = $image_url; diff --git a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php index 463cd5a..f47d623 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php 
@@ -313,9 +313,8 @@ public function getVectorMLInfo() { ]; } - public function callNlPwithArguments() { - return []; - } + abstract public function callImageML($image_url, $labels):mixed; + protected function getNLPClient() { if ($this->nlp_client) { diff --git a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php index 58c2a80..4bce158 100644 --- a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php +++ b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php @@ -10,6 +10,7 @@ use Drupal\Core\Form\FormStateInterface; use Drupal\Core\Form\OptGroup; use Drupal\Core\Language\LanguageInterface; +use Drupal\Core\StreamWrapper\StreamWrapperManager; use Drupal\Core\TypedData\ComplexDataDefinitionInterface; use Drupal\Core\TypedData\DataDefinitionInterface; use Drupal\node\NodeStorageInterface; @@ -31,6 +32,7 @@ use Drupal\Core\Entity\Element\EntityAutocomplete; use Drupal\Core\Cache\CacheBackendInterface; use Drupal\Core\Render\RenderContext; +use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessor\abstractMLPostProcessor; /** * Defines a filter that handles Image Similarity. @@ -356,7 +358,7 @@ public function validateOptionsForm(&$form, FormStateInterface $form_state) { $config = $plugin_config_entity->getPluginconfig(); // Note, we could also restrict to the same image mimetypes that the processor is setup to handle? if (isset($config['ml_method'])) { - $vector_size = \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessor\abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$config['ml_method']] ?? ''; + $vector_size = abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$config['ml_method']] ?? ''; $field_info = $this->getSbfDenseVectorFieldSource($field_id); if ($field_info) { // We do allow mixed data sources. One can be a node of course even if the source is a flavor. This is because each source could inherit properties from the other. 
@@ -423,7 +425,7 @@ public function buildExposeForm(&$form, FormStateInterface $form_state) { public function query() { - if (empty($this->value)) { + if (empty($this->value) || empty($this->validated_exposed_input)) { return; } /* @@ -435,13 +437,69 @@ public function query() { w = {float} 1.0 h = {float} 1.0 */ - - // Select boxes will always generate a single value. - // I could check here or cast sooner on validation? + // Just to be sure here bc we have our own way. Who knows if some external code decides to alter the value + $this->value = $this->validated_exposed_input; + // We should only be at this stage if we have validation if (!is_array($this->value)) { $this->value = (array) $this->value; } + // As always, start by processing all inline, then move to separate code for cleaner methods + // We need to load the SBR entity first here + $iiif_image_url = null; + $processor_id = $this->options['ml_strawberry_postprocessor']; + /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity|null */ + $plugin_config_entity = $this->sbrEntityStorage->load($processor_id); + if ($plugin_config_entity->isActive()) { + $config = $plugin_config_entity->getPluginconfig(); + // Now we need to actually generate an instance of the runner using the config + $entity_id = $plugin_config_entity->id(); + $configuration_options = $plugin_config_entity->getPluginconfig(); + $configuration_options['configEntity'] = $entity_id; + /* @var \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginInterface $plugin_instance */ + $plugin_instance + = $this->strawberryRunnerProcessorPluginManager->createInstance( + $plugin_config_entity->getPluginid(), + $configuration_options + ); + if ($plugin_instance instanceof abstractMLPostProcessor) { + $iiifidentifier = urlencode( + StreamWrapperManager::getTarget( $this->validated_exposed_input->iiif_image_id) ?? 
NULL + ); + if ($iiifidentifier == NULL || empty($iiifidentifier)) { + return; + } + // basically the whole image if no bbox will be used as default + // Now prep the image for fetching. First pass, just an ID, then deal with the UUID for the file option + // pct:x,y,w,h + // !w,h + $region = 'full'; + if (isset($this->validated_exposed_input->bbox->x)) { + $region = 'pct:'.($this->validated_exposed_input->bbox->x * 100).','.($this->validated_exposed_input->bbox->y * 100).','.($this->validated_exposed_input->bbox->w * 100).','.($this->validated_exposed_input->bbox->h * 100); + } + $iiif_image_url = $config['iiif_server']."/{$iiifidentifier}/{$region}/!640,640/0/default.jpg"; + try { + $response = $plugin_instance->callImageML($iiif_image_url, []); + } + catch (\Exception $exception) { + // Give user feedback + return; + } + if (!empty($response['error'])) { + // we should log this + return; + } + else { + // Now here is an issue. Each endpoint will return the vector inside a yolo/etc. + // We should change that and make it generic (requires new pythong code/rebuilding NLP container) + // so for now i will use the ml method config split/last to get the right key. + + } + } + } + if (!$iiif_image_url) { + return; + } $query = $this->getQuery(); if (array_filter($this->value, 'is_numeric') === $this->value) { @@ -499,7 +557,7 @@ public function validateExposed(&$form, FormStateInterface $form_state) { if ($json_input) { // Probably not the place to compress the data for the URL? $encoded = base64_encode(gzcompress($values[0])); - + $form_state->setValue($identifier, $encoded); $this->validated_exposed_input = $json_input; } elseif ($this->is_base64($values[0])) { @@ -516,13 +574,17 @@ public function validateExposed(&$form, FormStateInterface $form_state) { // Check if the JSON is the right structure. $form_state->setErrorByName($identifier, $this->t("Wrong format for the ML Image filter input")); } + else { + // Else what diego? + } } else { - // Do for non exposed. 
Should be directly a JSON + // Do for non exposed. Should be directly a JSON? } } + public function acceptExposedInput($input) { // Called during the form submit itself.. $rc = parent::acceptExposedInput($input); @@ -533,11 +595,22 @@ public function acceptExposedInput($input) { if (isset($this->validated_exposed_input)) { $this->value = $this->validated_exposed_input; } + else { + $this->value = NULL; + } } - return $rc; } + /** + * @inheritDoc + */ + public function submitExposed(&$form, FormStateInterface $form_state) + { + parent::submitExposed($form, $form_state); // TODO: Change the autogenerated stub + } + + /** * Retrieves a list of all fields that contain in its path a Node Entity. * From f6d40702b07ec9b4c681d35880fc49c57de44503 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sat, 18 May 2024 13:15:43 -0400 Subject: [PATCH 21/44] Yep. This is working now. We need to alter based on the backend option (strawberryfield level, since that module provides the knn fields) the query and join/etc the weird magic --- .../filter/StrawberryRunnersMLImagefilter.php | 59 +++++++++++++------ 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php index 4bce158..8495592 100644 --- a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php +++ b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php @@ -355,10 +355,10 @@ public function validateOptionsForm(&$form, FormStateInterface $form_state) { /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity|null */ $plugin_config_entity = $this->sbrEntityStorage->load($processor_id); if ($plugin_config_entity->isActive()) { - $config = $plugin_config_entity->getPluginconfig(); + $sbr_config = $plugin_config_entity->getPluginconfig(); // Note, we could also restrict to the same image mimetypes that the processor is setup to handle? 
- if (isset($config['ml_method'])) { - $vector_size = abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$config['ml_method']] ?? ''; + if (isset($sbr_config['ml_method'])) { + $vector_size = abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$sbr_config['ml_method']] ?? ''; $field_info = $this->getSbfDenseVectorFieldSource($field_id); if ($field_info) { // We do allow mixed data sources. One can be a node of course even if the source is a flavor. This is because each source could inherit properties from the other. @@ -425,7 +425,8 @@ public function buildExposeForm(&$form, FormStateInterface $form_state) { public function query() { - if (empty($this->value) || empty($this->validated_exposed_input)) { + if (empty($this->value) || empty($this->validated_exposed_input) || !$this->getQuery()) { + // basically not validated, not present as a value and also someone cancelled/nuklled the query before? return; } /* @@ -450,7 +451,7 @@ public function query() { /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity|null */ $plugin_config_entity = $this->sbrEntityStorage->load($processor_id); if ($plugin_config_entity->isActive()) { - $config = $plugin_config_entity->getPluginconfig(); + $sbr_config = $plugin_config_entity->getPluginconfig(); // Now we need to actually generate an instance of the runner using the config $entity_id = $plugin_config_entity->id(); $configuration_options = $plugin_config_entity->getPluginconfig(); @@ -476,7 +477,7 @@ public function query() { if (isset($this->validated_exposed_input->bbox->x)) { $region = 'pct:'.($this->validated_exposed_input->bbox->x * 100).','.($this->validated_exposed_input->bbox->y * 100).','.($this->validated_exposed_input->bbox->w * 100).','.($this->validated_exposed_input->bbox->h * 100); } - $iiif_image_url = $config['iiif_server']."/{$iiifidentifier}/{$region}/!640,640/0/default.jpg"; + $iiif_image_url = 
$sbr_config['iiif_server']."/{$iiifidentifier}/{$region}/!640,640/0/default.jpg"; try { $response = $plugin_instance->callImageML($iiif_image_url, []); } @@ -488,25 +489,28 @@ public function query() { // we should log this return; } - else { + elseif (isset($response['message'])) { // Now here is an issue. Each endpoint will return the vector inside a yolo/etc. // We should change that and make it generic (requires new pythong code/rebuilding NLP container) // so for now i will use the ml method config split/last to get the right key. - - + foreach (["error","message","web64"] as $remove) { + unset($response[$remove]); + } + $all_knns = $this->getQuery()->getOption('sbf_knn') ?? []; + foreach ($response as $endpoint_key => $values) { + if (isset($values['vector']) && is_array($values['vector']) && count($values['vector']) == abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$sbr_config['ml_method']]) { + $all_knns[$this->getPluginId()][] = $this->buildKNNQuery($this->getQuery(), $values['vector']); + } + } + array_filter($all_knns[$this->getPluginId()]); + if (count($all_knns[$this->getPluginId()])) { + $this->getQuery()->setOption('sbf_knn', $all_knns); + } } } } if (!$iiif_image_url) { return; - } - $query = $this->getQuery(); - - if (array_filter($this->value, 'is_numeric') === $this->value) { - - } - else { - } return; } @@ -663,4 +667,25 @@ protected function is_base64($s){ return true; } + + /** + * @param \Drupal\search_api\Plugin\views\query\SearchApiQuery $query + * + * @throws \Drupal\search_api\SearchApiException + */ + protected function buildKNNQuery(SearchApiQuery $query, array $vector=[]):array|null { + // We can only use Solr kids. 
+ $solr_query_string = []; + $backend = $query->getIndex()->getServerInstance()->getBackend(); + if (!($backend instanceof \Drupal\search_api_solr\SolrBackendInterface)) { + return FALSE; + } + $allfields_translated_to_solr = $backend + ->getSolrFieldNames($query->getIndex()); + if (isset($allfields_translated_to_solr[$this->options['sbf_fields']])) { + $solr_query_string[] = "{!knn f={$allfields_translated_to_solr[$this->options['sbf_fields']]} topK={$this->options['topk']}}[" . implode(', ', $vector) . ']'; + // {!knn f=vector topK=3}[-9.01364535e-03, -7.26634488e-02, -1.73818860e-02, ..., -1.16323479e-01] + } + return $solr_query_string; + } } From fe68ce3d1d143c0a330defce9bd97ecf94a7cab1 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sat, 18 May 2024 15:30:26 -0400 Subject: [PATCH 22/44] ha! it works... not perfect and needs more validation/cleaning Plus the filter processing via exposed filters (which is indeed a different plugin and the one i really need for interactive filtering) --- .../views/filter/StrawberryRunnersMLImagefilter.php | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php index 8495592..8d98587 100644 --- a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php +++ b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php @@ -499,11 +499,11 @@ public function query() { $all_knns = $this->getQuery()->getOption('sbf_knn') ?? 
[]; foreach ($response as $endpoint_key => $values) { if (isset($values['vector']) && is_array($values['vector']) && count($values['vector']) == abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$sbr_config['ml_method']]) { - $all_knns[$this->getPluginId()][] = $this->buildKNNQuery($this->getQuery(), $values['vector']); + $all_knns[] = $this->buildKNNQuery($this->getQuery(), $values['vector']); } } - array_filter($all_knns[$this->getPluginId()]); - if (count($all_knns[$this->getPluginId()])) { + array_filter($all_knns); + if (count($all_knns)) { $this->getQuery()->setOption('sbf_knn', $all_knns); } } @@ -553,6 +553,12 @@ public function validateExposed(&$form, FormStateInterface $form_state) { $this->validated_exposed_input = NULL; $identifier = $this->options['expose']['identifier']; $input = $form_state->getValue($identifier); + if (is_string($input)) { + trim($input); + if (strlen($input) == 0) { + return; + } + } $values = (array) $input; if ($values) { if ($this->isExposed()) { From 24d234471dd49e51711ef470a184a2d2598fa840 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 20 May 2024 17:52:29 -0400 Subject: [PATCH 23/44] Oh gosh Drupal. Argument queries (even if basically a filter) are 0.0 similar! @alliomeria (*since i did not pinged you before) I think that for exposed ML arguments instead of passing a JSON i will only allow a UUID of an existing file + a fragment selector so UUID#xywh=percent:{$left},{$top},{$width},{$height}" Why? The argument is shorter/easier do decode... 
also will basically remove the creepness of passing JSON/and s3:// Adds though an extra file load, but that adds security (ask me if you want tomorrow why an Contextual filter, and i will answer) --- .../StrawberryRunnersMLImageArgument.php | 569 ++++++++++++++++++ strawberry_runners.module | 6 + 2 files changed, 575 insertions(+) create mode 100644 src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php diff --git a/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php b/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php new file mode 100644 index 0000000..c2a7f91 --- /dev/null +++ b/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php @@ -0,0 +1,569 @@ +setSbrEntityStorage( + $container->get('entity_type.manager')->getStorage('strawberry_runners_postprocessor') + ); + $plugin->setFieldsHelper($container->get('search_api.fields_helper')); + $plugin->setViewStorage( + $container->get('entity_type.manager')->getStorage('view') + ); + $plugin->setViewStorage( + $container->get('entity_type.manager')->getStorage('view') + ); + $plugin->setCache($container->get('cache.default')); + $plugin->currentUser = $container->get('current_user'); + $plugin->strawberryRunnerUtilityService = $container->get( + 'strawberry_runner.utility' + ); + $plugin->strawberryRunnerProcessorPluginManager = $container->get( + 'strawberry_runner.processor_manager' + ); + return $plugin; + } + + + /** + * {@inheritdoc} + */ + public function defineOptions() { + $options = parent::defineOptions(); + $options['value']['default'] = []; + $options['sbf_fields'] = ['default' => NULL]; + $options['pre_query'] = ['default' => TRUE]; + $options['pre_query_facets'] = ['default' => TRUE]; + $options['topk'] = ['default' => 3]; + $options['ml_strawberry_postprocessor'] = ['default' => NULL]; + return $options; + } + + public function setSbrEntityStorage(EntityStorageInterface $sbrEntityStorage) + { + $this->sbrEntityStorage = $sbrEntityStorage; + return $this; + } + + + 
protected function valueSubmit($form, FormStateInterface $form_state) { + $form_state = $form_state; + } + + public function setFieldsHelper(FieldsHelperInterface $fieldsHelper) { + $this->fieldsHelper = $fieldsHelper; + return $this; + } + + /** + * Sets the View Storage. + * + * @param \Drupal\Core\Entity\EntityStorageInterface $viewstorage + * The view Storage. + * + * @return $this + */ + public function setViewStorage(EntityStorageInterface $viewstorage) { + $this->viewStorage = $viewstorage; + return $this; + } + + /** + * Sets the Cache Backed. + * + * @param \Drupal\Core\Cache\CacheBackendInterface $cache + * The cache backend. Use to store complex calculations of property paths. + * + * @return $this + */ + public function setCache(CacheBackendInterface $cache) { + $this->cache = $cache; + return $this; + } + + public function showOperatorForm(&$form, FormStateInterface $form_state) { + } + + /** + * {@inheritdoc} + */ + public function buildOptionsForm(&$form, FormStateInterface $form_state) { + parent::buildOptionsForm($form, $form_state); + $active_plugins = $this->strawberryRunnerUtilityService->getActivePluginConfigs(FALSE); + + foreach ($active_plugins as $by_source => $plugins) { + foreach ($plugins as $entity_id => $active_plugin) { + if (isset($active_plugin['ml_method'])) { + $post_processor_options[$entity_id] = $active_plugin['ml_method'] ."({$entity_id})"; + } + } + } + + $fields = $this->getSbfDenseVectorFields() ?? []; + $form['sbf_fields'] = [ + '#type' => 'select', + '#title' => $this->t( + 'KNN Dense Vector Field to query against' + ), + '#description' => $this->t( + 'Select the fields that will be used to query against.' 
+ ), + '#options' => $fields, + '#multiple' => FALSE, + '#default_value' => $this->options['sbf_fields'], + '#required' => TRUE, + ]; + $form['pre_query'] = [ + '#type' => 'checkbox', + '#default_value' => $this->options['pre_query'], + '#title' => $this->t('Treat previous filters to this as prequeries'), + '#description'=> $this->t( + 'If any other filter setup before this one will be treated as pre-queries to the actual KNN query.' + ), + ]; + $form['pre_query_facets'] = [ + '#type' => 'checkbox', + '#default_value' => $this->options['pre_query_facets'], + '#title' => $this->t('Treat also facets, if any, as prequeries'), + '#description'=> $this->t( + 'If any other facets will be treated as pre-queries to the actual KNN query.' + ), + ]; + $form['topk'] = [ + '#type' => 'number', + '#default_value' => $this->options['topk'], + '#title' => $this->t('Top Similarity KNN hits to request to the backend.'), + '#description'=> $this->t( + 'The more, the slower' + ), + '#min' => 1, + '#max' => 100, + ]; + $form['ml_strawberry_postprocessor'] = [ + '#type' => 'select', + '#title' => $this->t( + 'Strawberry Runners processor to extract the on-the fly embedding' + ), + '#description' => $this->t( + 'Select the ML Strawberry Runners Processor that was used to index Vectors into the field you are going to search against. These need to match' + ), + '#options' => $post_processor_options, + '#multiple' => FALSE, + '#default_value' => $this->options['ml_strawberry_postprocessor'], + '#required' => TRUE, + ]; + } + /** + * Validate the options form. + */ + public function validateOptionsForm(&$form, FormStateInterface $form_state) { + // We need to validate that the selected field is of the same source/size as model that will + // be used to generate the on the fly vectors. 
+ // So we need to load the SBR entity passed, compare the model against the constant present in + // \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessor\abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE + // and then load the field and see if the source (is of the same SBFlavor property/size (vector_576, etc) + $options = $form_state->getValue('options'); + $processor_id = $options['ml_strawberry_postprocessor'] ?? NULL; + $field_id = $options['sbf_fields']; + if ($processor_id) { + /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity|null */ + $plugin_config_entity = $this->sbrEntityStorage->load($processor_id); + if ($plugin_config_entity->isActive()) { + $sbr_config = $plugin_config_entity->getPluginconfig(); + // Note, we could also restrict to the same image mimetypes that the processor is setup to handle? + if (isset($sbr_config['ml_method'])) { + $vector_size = abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$sbr_config['ml_method']] ?? ''; + $field_info = $this->getSbfDenseVectorFieldSource($field_id); + if ($field_info) { + // We do allow mixed data sources. One can be a node of course even if the source is a flavor. This is because each source could inherit properties from the other. + $propath_pieces = explode('/', $field_info->getCombinedPropertyPath()); + if (!(end($propath_pieces) == 'vector_' .$vector_size && $field_info->getType() == 'densevector_' . $vector_size)) { + $form_state->setErrorByName('options][ml_strawberry_postprocessor', $this->t('The Field/Processor combination is not right. Make sure your Configured KNN Dense Vector Field and the Strawberry Processor are targeting the same Vector Dimensions (e.g first one is from a vector_576 data source property and the field type is densevector_576 and the processor is calling YOLO)')); + } + } + else { + // The field is gone. 
+ $form_state->setErrorByName('options][sbf_fields', $this->t('CConfigured KNN Dense Vector Field does not longer exists. Please replace your config with a valid/indexed field.')); + } + } + } + } + } + + public function submitOptionsForm(&$form, FormStateInterface $form_state) { + parent::submitOptionsForm( + $form, $form_state + ); + } + + + /** + * Set the input for this argument. + * + * @return TRUE if it successfully validates; FALSE if it does not. + */ + public function setArgument($arg) { + $this->argument = $arg; + return $this->validateArgument($arg); + } + + + public function query($group_by = FALSE) { + $this->argument_validated; + if (empty($this->expanded_argument) || ! $this->query) { + // basically not validated, not present as a value and also someone cancelled/nuklled the query before? + return; + } + /* + * $this->value = {stdClass} + iiif_image_id = "3b9%2Fimage-dcpl-p034-npsncr-00015-rexported-f2c69aeb-7bcb-434a-a781-e580cb3695b7.tiff" + bbox = {stdClass} + x = {float} 0.0 + y = {float} 0.0 + w = {float} 1.0 + h = {float} 1.0 + */ + // Just to be sure here bc we have our own way. 
Who knows if some external code decides to alter the value + $this->value = $this->expanded_argument; + // We should only be at this stage if we have validation + // As always, start by processing all inline, then move to separate code for cleaner methods + // We need to load the SBR entity first here + $iiif_image_url = null; + $processor_id = $this->options['ml_strawberry_postprocessor']; + /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity|null */ + $plugin_config_entity = $this->sbrEntityStorage->load($processor_id); + if ($plugin_config_entity->isActive()) { + $sbr_config = $plugin_config_entity->getPluginconfig(); + // Now we need to actually generate an instance of the runner using the config + $entity_id = $plugin_config_entity->id(); + $configuration_options = $plugin_config_entity->getPluginconfig(); + $configuration_options['configEntity'] = $entity_id; + /* @var \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginInterface $plugin_instance */ + $plugin_instance + = $this->strawberryRunnerProcessorPluginManager->createInstance( + $plugin_config_entity->getPluginid(), + $configuration_options + ); + if ($plugin_instance instanceof abstractMLPostProcessor) { + $iiifidentifier = urlencode( + StreamWrapperManager::getTarget($this->value->iiif_image_id) ?? NULL + ); + if ($iiifidentifier == NULL || empty($iiifidentifier)) { + return; + } + // basically the whole image if no bbox will be used as default + // Now prep the image for fetching. 
First pass, just an ID, then deal with the UUID for the file option + // pct:x,y,w,h + // !w,h + $region = 'full'; + if (isset($this->value->bbox->x)) { + $region = 'pct:'.($this->value->bbox->x * 100).','.($this->value->bbox->y * 100).','.($this->value->bbox->w * 100).','.($this->value->bbox->h * 100); + } + $iiif_image_url = $sbr_config['iiif_server']."/{$iiifidentifier}/{$region}/!640,640/0/default.jpg"; + try { + $response = $plugin_instance->callImageML($iiif_image_url, []); + } + catch (\Exception $exception) { + // Give user feedback + return; + } + if (!empty($response['error'])) { + // we should log this + return; + } + elseif (isset($response['message'])) { + // Now here is an issue. Each endpoint will return the vector inside a yolo/etc. + // We should change that and make it generic (requires new pythong code/rebuilding NLP container) + // so for now i will use the ml method config split/last to get the right key. + foreach (["error","message","web64"] as $remove) { + unset($response[$remove]); + } + $all_knns = $this->query->getOption('sbf_knn') ?? []; + foreach ($response as $endpoint_key => $values) { + if (isset($values['vector']) && is_array($values['vector']) && count($values['vector']) == abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$sbr_config['ml_method']]) { + $all_knns[] = $this->buildKNNQuery($this->query, $values['vector']); + } + } + array_filter($all_knns); + if (count($all_knns)) { + $this->query->setOption('sbf_knn', $all_knns); + } + } + } + } + if (!$iiif_image_url) { + return; + } + return; + } + + public function validateArgument($arg) { + + $this->expanded_argument = NULL; + + // By using % in URLs, arguments could be validated twice; this eases + // that pain. 
+ if (isset($this->argument_validated)) { + return $this->argument_validated; + } + + if ($this->isException($arg)) { + return $this->argument_validated = TRUE; + } + + $plugin = $this->getPlugin('argument_validator'); + //return $this->argument_validated = $plugin->validateArgument($arg); + if ($arg) { + // If already JSON + $json_input = StrawberryfieldJsonHelper::isValidJsonSchema($arg, static::IMAGEML_INPUT_SCHEMA); + if ($json_input) { + // Probably not the place to compress the data for the URL? + $encoded = base64_encode(gzcompress($arg)); + $form_state->setValue($identifier, $encoded); + $this->expanded_argument = $json_input; + } // WE uRL decode because base64 might contain "/" which is the argument separator. So we pre-encoded it. + elseif ($this->is_base64(urldecode($arg))) { + $decoded = gzuncompress(base64_decode(urldecode("eJxNzssOgjAQheE1JLyD6ZqB0ivwMqS3wRoVojFqCO9uwZi4%2B7%2FZnFmKPCMxRhzixYxhiJ70B3LnfV2bTtX7EURwjeYCrZDKdoZZJ1xgwgT0lCO2MBt7nuAabg9voOkUBy1pq2lgwLz0IJRk0KKg4IzSqIJ1yHV1mkdSbg9YO73S7pI6I1vRipY73v94JjQ%2FHL9IvRb5%2BgG%2BNTFV"))); + + if ($decoded !== FALSE) { + $json_input = StrawberryfieldJsonHelper::isValidJsonSchema($decoded, static::IMAGEML_INPUT_SCHEMA); + if ($json_input !== FALSE) { + $this->expanded_argument = $json_input; + } + } + } + if (!$this->expanded_argument) { + $this->argument_validated = FALSE; + } + else { + if ($this->expanded_argument->iiif_image_id && !(empty($this->expanded_argument->iiif_image_id))) { + $image_id = StreamWrapperManager::getTarget($this->expanded_argument->iiif_image_id); + // means passed without a streamwrapper + if ($image_id) { + $this->argument_validated = TRUE; + } + } + } + } + return $this->argument_validated ?? FALSE; + } + + + /** + * Retrieves a list of all fields that contain in its path a Node Entity. + * + * @return string[] + * An options list of field identifiers mapped to their prefixed + * labels. 
+ */ + protected function getSbfDenseVectorFields() { + $fields = []; + /** @var \Drupal\search_api\IndexInterface $index */ + $index = Index::load(substr($this->table, 17)); + + $fields_info = $index->getFields(); + foreach ($fields_info as $field_id => $field) { + //if (($field->getDatasourceId() == 'strawberryfield_flavor_datasource') && ($field->getType() == "integer")) { + // Anything except text, fulltext or any solr_text variations. Also skip direct node id and UUIDs which would + // basically return the same ADO as input filtered, given that those are unique. + $property_path = $field->getPropertyPath(); + $datasource_id = $field->getDatasourceId(); + if (str_starts_with($field->getType(), 'densevector_') === TRUE) { + $field->getDataDefinition(); + $fields[$field_id] = $field->getPrefixedLabel() . '('. $field->getFieldIdentifier() .')'; + } + } + return $fields; + } + + protected function getSbfDenseVectorFieldSource($field_id) { + $fields = []; + /** @var \Drupal\search_api\IndexInterface $index */ + $index = Index::load(substr($this->table, 17)); + $fields_info = $index->getField($field_id); + return $fields_info; + } + + protected function getExistingDenseVectorForImage($uri, $field) { + + } + + protected function is_base64($s){ + // Check if there are valid base64 characters + if (!preg_match('/^[a-zA-Z0-9\/\r\n+]*={0,2}$/', $s)) return false; + + // Decode the string in strict mode and check the results + $decoded = base64_decode($s, true); + if(false === $decoded) return false; + + // Encode the string again + if(base64_encode($decoded) != $s) return false; + + return true; + } + + /** + * @param \Drupal\search_api\Plugin\views\query\SearchApiQuery $query + * + * @throws \Drupal\search_api\SearchApiException + */ + protected function buildKNNQuery(SearchApiQuery $query, array $vector=[]):array|null { + // We can only use Solr kids. 
+ $solr_query_string = []; + $backend = $query->getIndex()->getServerInstance()->getBackend(); + if (!($backend instanceof \Drupal\search_api_solr\SolrBackendInterface)) { + return FALSE; + } + $allfields_translated_to_solr = $backend + ->getSolrFieldNames($query->getIndex()); + if (isset($allfields_translated_to_solr[$this->options['sbf_fields']])) { + $solr_query_string[] = "{!knn f={$allfields_translated_to_solr[$this->options['sbf_fields']]} topK={$this->options['topk']}}[" . implode(', ', $vector) . ']'; + // {!knn f=vector topK=3}[-9.01364535e-03, -7.26634488e-02, -1.73818860e-02, ..., -1.16323479e-01] + } + return $solr_query_string; + } +} diff --git a/strawberry_runners.module b/strawberry_runners.module index 1d77240..cf7df2f 100644 --- a/strawberry_runners.module +++ b/strawberry_runners.module @@ -30,6 +30,12 @@ function strawberry_runners_views_data_alter(array &$data) { 'field' => 'id', 'id' => 'sbr_imageml_filter', ], + 'argument' => [ + 'title' => t('Image Similarity Filter via KNN '), + 'field' => 'id', + 'id' => 'sbr_imageml_filter', + 'disable_break_phrase' => TRUE, // Disallows multiple values for ML fields + ], ]; if ($ml_image_filter != 'sbr_imageml_filter') { $table[$ml_image_filter]['real field'] = 'sbr_imageml_filter'; From 7722afbc2c5f4eadd85449f650289dee19a7ef5d Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 20 May 2024 17:53:07 -0400 Subject: [PATCH 24/44] Clean up the demo filter ... This filter could also allow a "hide" option... so basically only to be set via URL arguments ... right? @alliomeria right? 
--- .../filter/StrawberryRunnersMLImagefilter.php | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php index 8d98587..1ecc767 100644 --- a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php +++ b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php @@ -431,7 +431,7 @@ public function query() { } /* * $this->value = {stdClass} - iiif_image_id = "3b9%2Fimage-dcpl-p034-npsncr-00015-rexported-f2c69aeb-7bcb-434a-a781-e580cb3695b7.tiff" + iiif_image_id = "s3://3b9%2Fimage-dcpl-p034-npsncr-00015-rexported-f2c69aeb-7bcb-434a-a781-e580cb3695b7.tiff" bbox = {stdClass} x = {float} 0.0 y = {float} 0.0 @@ -571,10 +571,10 @@ public function validateExposed(&$form, FormStateInterface $form_state) { $this->validated_exposed_input = $json_input; } elseif ($this->is_base64($values[0])) { - $decoded = gzdecode(base64_decode($values[0])); + $decoded = gzuncompress(base64_decode($values[0])); if ($decoded !== FALSE) { $json_input = StrawberryfieldJsonHelper::isValidJsonSchema($values[0], static::IMAGEML_INPUT_SCHEMA); - if ($json_input === JSON_ERROR_NONE) { + if ($json_input !== FALSE) { $this->validated_exposed_input = $json_input; } } @@ -585,7 +585,13 @@ public function validateExposed(&$form, FormStateInterface $form_state) { $form_state->setErrorByName($identifier, $this->t("Wrong format for the ML Image filter input")); } else { - // Else what diego? + if ($this->validated_exposed_input->iiif_image_id && !(empty($this->validated_exposed_input->iiif_image_id))) { + $image_id = StreamWrapperManager::getTarget($this->validated_exposed_input->iiif_image_id); + // means passed without a streamwrapper + if (!$image_id) { + $form_state->setErrorByName($identifier, $this->t("Wrong format for the ML IIIF Image ID property. 
Make sure it contains a streamwrapper (e.g s3://)")); + } + } } } else { From 8151899a4235169e1c8ef5a290cf726268360ec9 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 20 May 2024 21:18:11 -0400 Subject: [PATCH 25/44] Remove non sense here. Still need to rewrite how this one reads data pack("*c" and stuff) --- .../views/argument/StrawberryRunnersMLImageArgument.php | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php b/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php index c2a7f91..87d6dd0 100644 --- a/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php +++ b/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php @@ -459,14 +459,7 @@ public function validateArgument($arg) { //return $this->argument_validated = $plugin->validateArgument($arg); if ($arg) { // If already JSON - $json_input = StrawberryfieldJsonHelper::isValidJsonSchema($arg, static::IMAGEML_INPUT_SCHEMA); - if ($json_input) { - // Probably not the place to compress the data for the URL? - $encoded = base64_encode(gzcompress($arg)); - $form_state->setValue($identifier, $encoded); - $this->expanded_argument = $json_input; - } // WE uRL decode because base64 might contain "/" which is the argument separator. So we pre-encoded it. 
- elseif ($this->is_base64(urldecode($arg))) { + if ($this->is_base64(urldecode($arg))) { $decoded = gzuncompress(base64_decode(urldecode("eJxNzssOgjAQheE1JLyD6ZqB0ivwMqS3wRoVojFqCO9uwZi4%2B7%2FZnFmKPCMxRhzixYxhiJ70B3LnfV2bTtX7EURwjeYCrZDKdoZZJ1xgwgT0lCO2MBt7nuAabg9voOkUBy1pq2lgwLz0IJRk0KKg4IzSqIJ1yHV1mkdSbg9YO73S7pI6I1vRipY73v94JjQ%2FHL9IvRb5%2BgG%2BNTFV"))); if ($decoded !== FALSE) { From 2c20fed2f7e476b6e9117147762cf2c2fd283103 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Thu, 23 May 2024 10:18:43 -0400 Subject: [PATCH 26/44] More supporting ML code commit (tested) Processors are getting cleaner (not there yet) and Image Argument filter less beef-fy --- .../MLMobileNetPostProcessor.php | 187 ++++++++++++++++++ .../MLYoloPostProcessor.php | 6 +- .../abstractMLPostProcessor.php | 2 +- .../StrawberryRunnersMLImageArgument.php | 168 +++++++--------- 4 files changed, 262 insertions(+), 101 deletions(-) create mode 100644 src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php new file mode 100644 index 0000000..b55ee57 --- /dev/null +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php @@ -0,0 +1,187 @@ + 'asstructure', + 'mime_type' => ['image/jpeg'], + 'output_type' => 'json', + 'output_destination' => 'searchapi', + 'processor_queue_type' => 'background', + 'language_key' => 'language_iso639_3', + 'language_default' => 'eng', + 'timeout' => 300, + 'nlp_url' => 'http://esmero-nlp:6400', + 'ml_method' => '/image/mobilenet', + ] + parent::defaultConfiguration(); + } + + public function settingsForm(array $parents, FormStateInterface $form_state) { + $element = parent::settingsForm($parents, $form_state); + return $element; + } + + protected function runTextMLfromMetadata($io, NlpClient $nlpClient): \stdClass { + $output = new \stdClass(); + return $output; + // 
TODO: Implement runTextMLfromMetadata() method. + } + + protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { + $output = new \stdClass(); + $config = $this->getConfiguration(); + $input_argument = $this->pluginDefinition['input_argument']; + $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? '') : 'eng']; + $sequence_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; + setlocale(LC_CTYPE, 'en_US.UTF-8'); + $width = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['width'] ?? NULL; + $height = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['height'] ?? NULL; + if (!($width && $height)) { + $width = $io->input->metadata['flv:exif']['ImageWidth'] ?? NULL; + $height = $io->input->metadata['flv:exif']['ImageHeight'] ?? NULL; + } + $iiifidentifier = urlencode( + StreamWrapperManager::getTarget( isset($io->input->metadata['url']) ? $io->input->metadata['url'] : NULL) + ); + + if ($iiifidentifier == NULL || empty($iiifidentifier)) { + return $output; + } + /// Mobilenet does its own (via mediapipe) image scalling. So we can pass a smaller if needed. Internally + /// it uses 480 x 480 but not good to pass square bc it makes % bbox calculation harder. + // But requires us to call info.json and pre-process the sizes. + $iiif_image_url = $config['iiif_server']."/{$iiifidentifier}/full/full/0/default.jpg"; + //@TODO we are not filtering here by label yet. Next release. + $labels = []; + $page_text = NULL; + $output->plugin = NULL; + $labels = []; + $ML = $this->callImageML($iiif_image_url,$labels); + $output->searchapi['vector_1024'] = isset($ML['mobilenet']['vector']) && is_array($ML['mobilenet']['vector']) && count($ML['mobilenet']['vector'])== 1024 ? 
$ML['mobilenet']['vector'] : NULL; + if (isset($ML['mobilenet']['objects']) && is_array($ML['mobilenet']['objects']) && count($ML['mobilenet']['objects']) > 0 ) { + $miniocr = $this->mobilenetToMiniOCR($ML['mobilenet']['objects'], $width, $height, $sequence_number); + $output->searchapi['fulltext'] = $miniocr; + $output->plugin = $miniocr; + $page_text = isset($output->searchapi['fulltext']) ? strip_tags(str_replace("", + PHP_EOL . " ", $output->searchapi['fulltext'])) : ''; + // What is a good confidence ratio here? + // based on the % of the bounding box? + // Just the value? + foreach($ML['mobilenet']['objects'] as $object) { + $labels[$object['name']] = $object['name']; + } + } + $output->searchapi['metadata'] = $labels; + $output->searchapi['service_md5'] = isset($ML['mobilenet']['modelinfo']) ? md5(json_encode($ML['mobilenet']['modelinfo'])) : NULL; + $output->searchapi['plaintext'] = $page_text ?? ''; + $output->searchapi['processlang'] = $file_languages; + $output->searchapi['ts'] = date("c"); + $output->searchapi['label'] = $this->t("MobileNet ML Image Embeddings & Vectors") . ' ' . $sequence_number; + return $output; + } + + + protected function mobilenetToMiniOCR(array $objects, $width, $height, $pageid) { + $miniocr = new \XMLWriter(); + $miniocr->openMemory(); + $miniocr->startDocument('1.0', 'UTF-8'); + $miniocr->startElement("ocr"); + $atleastone_word = FALSE; + // To avoid divisions by 0 + $pwidth = (float) $width; + $pheight = (float) $height; + // NOTE: floats are in the form of .1 so we need to remove the first 0. + $miniocr->startElement("p"); + $miniocr->writeAttribute("xml:id", 'ml_yolo_' . $pageid); + $miniocr->writeAttribute("wh", + ltrim($pwidth ?? '', 0) . " " . ltrim($pheight ?? 
'', 0)); + $miniocr->startElement("b"); + foreach ($objects as $object) { + $notFirstWord = FALSE; + $miniocr->startElement("l"); + $x0 = (float) $object['box']['x1']; + $y0 = (float) $object['box']['y1']; + $x1 = (float) $object['box']['x2']; + $y1 = (float) $object['box']['y2']; + $l = ltrim(sprintf('%.3f', $x0) ?? '', 0); + $t = ltrim(sprintf('%.3f', $y0) ?? '', 0); + $w = ltrim(sprintf('%.3f', ($x1 - $x0)) ?? '', 0); + $h = ltrim(sprintf('%.3f', ($y1 - $y0)) ?? '', 0); + $text = (string) ($object['name'] ?? 'Unlabeled') .' ~ '. (string) ("{$object['confidence']}" ?? "0"); + if ($notFirstWord) { + $miniocr->text(' '); + } + $notFirstWord = TRUE; + // New OCR Highlight does not like empty tags at all + if (strlen(trim($text ?? '')) > 0) { + $miniocr->startElement("w"); + $miniocr->writeAttribute("x", + $l . ' ' . $t . ' ' . $w . ' ' . $h); + $miniocr->text($text); + // Only assume we have at least one word for tags + // Since lines? could end empty? + $atleastone_word = TRUE; + $miniocr->endElement(); + } + $miniocr->endElement(); + } + $miniocr->endElement(); + $miniocr->endElement(); + $miniocr->endElement(); + $miniocr->endDocument(); + if ($atleastone_word) { + return $miniocr->outputMemory(TRUE); + } + else { + return StrawberryfieldFlavorDatasource::EMPTY_MINIOCR_XML; + } + } + + public function callImageML($image_url, $labels):mixed { + $nlpClient = $this->getNLPClient(); + $config = $this->getConfiguration(); + $arguments['iiif_image_url'] = $image_url; + //@TODO we are not filtering here by label yet. Next release. 
+ $arguments['labels'] = $labels; + $ML = $nlpClient->get_call($config['ml_method'], $arguments, 1); + return $ML; + } + +} diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php index cbf47a8..010167f 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php @@ -93,10 +93,10 @@ protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { $page_text = NULL; $output->plugin = NULL; $labels = []; - $ML = callImageML($iiif_image_url,$labels); + $ML = $this->callImageML($iiif_image_url,$labels); $output->searchapi['vector_576'] = isset($ML['yolo']['vector']) && is_array($ML['yolo']['vector']) && count($ML['yolo']['vector'])== 576 ? $ML['yolo']['vector'] : NULL; if (isset($ML['yolo']['objects']) && is_array($ML['yolo']['objects']) && count($ML['yolo']['objects']) > 0 ) { - $miniocr = $this->yolotToMiniOCR($ML['yolo']['objects'], $width, $height, $sequence_number); + $miniocr = $this->yoloToMiniOCR($ML['yolo']['objects'], $width, $height, $sequence_number); $output->searchapi['fulltext'] = $miniocr; $output->plugin = $miniocr; $page_text = isset($output->searchapi['fulltext']) ? 
strip_tags(str_replace("", @@ -118,7 +118,7 @@ protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { } - protected function yolotToMiniOCR(array $objects, $width, $height, $pageid) { + protected function yoloToMiniOCR(array $objects, $width, $height, $pageid) { $miniocr = new \XMLWriter(); $miniocr->openMemory(); $miniocr->startDocument('1.0', 'UTF-8'); diff --git a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php index f47d623..4fe1418 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php @@ -34,7 +34,7 @@ public function defaultConfiguration() { 'language_default' => 'eng', 'timeout' => 300, 'nlp_url' => 'http://esmero-nlp:6400', - 'ml_method' => 'yolov8', + 'ml_method' => NULL, 'iiif_server' => '', ] + parent::defaultConfiguration(); } diff --git a/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php b/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php index 87d6dd0..e6eaa1a 100644 --- a/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php +++ b/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php @@ -4,6 +4,7 @@ use Drupal\Core\Form\FormStateInterface; use Drupal\Core\StreamWrapper\StreamWrapperManager; +use Drupal\file\Entity\File; use Drupal\search_api\Entity\Index; use Drupal\search_api\Plugin\views\argument\SearchApiStandard; use Drupal\search_api\Plugin\views\query\SearchApiQuery; @@ -21,62 +22,7 @@ * * @ViewsArgument("sbr_imageml_filter") */ -class StrawberryRunnersMLImageArgument extends SearchApiStandard -{ - - const IMAGEML_INPUT_SCHEMA = <<<'JSON' -{ - "title": "Image ML filter Input structure", - "description": "A JSON Schema describing what this filter accepts.", - "type": "object", - "properties": { - "iiif_image_id": { - "type": "string" - }, - "image_uuid": { - "type": "string" - }, - 
"bbox": { - "type": "object", - "properties": { - "x": { - "type": "number" - }, - "y": { - "type": "number" - }, - "w": { - "type": "number" - }, - "h": { - "type": "number" - } - }, - "required": [ - "x", - "y", - "w", - "h" - ] - } - }, - "oneOf": [ - { - "required": [ - "iiif_image_id" - ] - }, - { - "required": [ - "image_uuid" - ] - } - ], - "required": [ - "bbox" - ] -} -JSON; +class StrawberryRunnersMLImageArgument extends SearchApiStandard { /** * Is argument validated. @@ -90,11 +36,18 @@ class StrawberryRunnersMLImageArgument extends SearchApiStandard public $expanded_argument = NULL; /** - * The Entity Type manager + * The SBR Entity Type Storage + * + * @var \Drupal\Core\Entity\EntityStorageInterface + */ + protected $sbrEntityStorage; + + /** + * The File Entity Type Storage * * @var \Drupal\Core\Entity\EntityStorageInterface */ - protected $sbrEntityStorage; + protected $fileEntityStorage; /** * The vocabulary storage. @@ -150,6 +103,9 @@ public static function create(ContainerInterface $container, array $configuratio $plugin->setSbrEntityStorage( $container->get('entity_type.manager')->getStorage('strawberry_runners_postprocessor') ); + $plugin->setFileEntityStorage( + $container->get('entity_type.manager')->getStorage('file') + ); $plugin->setFieldsHelper($container->get('search_api.fields_helper')); $plugin->setViewStorage( $container->get('entity_type.manager')->getStorage('view') @@ -189,6 +145,12 @@ public function setSbrEntityStorage(EntityStorageInterface $sbrEntityStorage) return $this; } + public function setFileEntityStorage(EntityStorageInterface $fileEntityStorage) + { + $this->fileEntityStorage = $fileEntityStorage; + return $this; + } + protected function valueSubmit($form, FormStateInterface $form_state) { $form_state = $form_state; @@ -242,7 +204,6 @@ public function buildOptionsForm(&$form, FormStateInterface $form_state) { } } } - $fields = $this->getSbfDenseVectorFields() ?? 
[]; $form['sbf_fields'] = [ '#type' => 'select', @@ -358,15 +319,6 @@ public function query($group_by = FALSE) { // basically not validated, not present as a value and also someone cancelled/nuklled the query before? return; } - /* - * $this->value = {stdClass} - iiif_image_id = "3b9%2Fimage-dcpl-p034-npsncr-00015-rexported-f2c69aeb-7bcb-434a-a781-e580cb3695b7.tiff" - bbox = {stdClass} - x = {float} 0.0 - y = {float} 0.0 - w = {float} 1.0 - h = {float} 1.0 - */ // Just to be sure here bc we have our own way. Who knows if some external code decides to alter the value $this->value = $this->expanded_argument; // We should only be at this stage if we have validation @@ -397,11 +349,9 @@ public function query($group_by = FALSE) { } // basically the whole image if no bbox will be used as default // Now prep the image for fetching. First pass, just an ID, then deal with the UUID for the file option - // pct:x,y,w,h - // !w,h $region = 'full'; if (isset($this->value->bbox->x)) { - $region = 'pct:'.($this->value->bbox->x * 100).','.($this->value->bbox->y * 100).','.($this->value->bbox->w * 100).','.($this->value->bbox->h * 100); + $region = 'pct:'.($this->value->bbox->x).','.($this->value->bbox->y).','.($this->value->bbox->w).','.($this->value->bbox->h); } $iiif_image_url = $sbr_config['iiif_server']."/{$iiifidentifier}/{$region}/!640,640/0/default.jpg"; try { @@ -411,11 +361,7 @@ public function query($group_by = FALSE) { // Give user feedback return; } - if (!empty($response['error'])) { - // we should log this - return; - } - elseif (isset($response['message'])) { + if (isset($response['message'])) { // Now here is an issue. Each endpoint will return the vector inside a yolo/etc. // We should change that and make it generic (requires new pythong code/rebuilding NLP container) // so for now i will use the ml method config split/last to get the right key. 
@@ -457,29 +403,57 @@ public function validateArgument($arg) { $plugin = $this->getPlugin('argument_validator'); //return $this->argument_validated = $plugin->validateArgument($arg); - if ($arg) { - // If already JSON - if ($this->is_base64(urldecode($arg))) { - $decoded = gzuncompress(base64_decode(urldecode("eJxNzssOgjAQheE1JLyD6ZqB0ivwMqS3wRoVojFqCO9uwZi4%2B7%2FZnFmKPCMxRhzixYxhiJ70B3LnfV2bTtX7EURwjeYCrZDKdoZZJ1xgwgT0lCO2MBt7nuAabg9voOkUBy1pq2lgwLz0IJRk0KKg4IzSqIJ1yHV1mkdSbg9YO73S7pI6I1vRipY73v94JjQ%2FHL9IvRb5%2BgG%2BNTFV"))); - - if ($decoded !== FALSE) { - $json_input = StrawberryfieldJsonHelper::isValidJsonSchema($decoded, static::IMAGEML_INPUT_SCHEMA); - if ($json_input !== FALSE) { - $this->expanded_argument = $json_input; - } + if ($arg && $this->is_base64(urldecode($arg))) { + // Because of actual implementation (JS to PHP) details changes are this will come from a JS encoded gzip that needs to be unpacked + // to try that first. On JS using pako with gzip is the ideal way. + // if unpacked it will be actuall an string encoded array (utf8, just numbers) + $arg = urldecode(base64_decode(urldecode($arg))); + $decoded = NULL; + $unpacked_deflated = explode(",", $arg); + if (count($unpacked_deflated) > 2) { + try { + $decoded = gzdecode(pack("c*",...$unpacked_deflated)); } - } - if (!$this->expanded_argument) { - $this->argument_validated = FALSE; - } - else { - if ($this->expanded_argument->iiif_image_id && !(empty($this->expanded_argument->iiif_image_id))) { - $image_id = StreamWrapperManager::getTarget($this->expanded_argument->iiif_image_id); - // means passed without a streamwrapper - if ($image_id) { - $this->argument_validated = TRUE; - } + catch (\Exception $e) { + // Ok was not that so we try another method + } + } + if (!$decoded) { + $decoded = gzuncompress($arg); + + + } + if ($decoded) { + $decoded_object = json_decode($decoded); + if ($decoded_object) { + if (!empty($decoded_object->fileuuid ?? NULL) && + !empty($decoded_object->nodeuuid ?? 
NULL) && + !empty($decoded_object->fragment ?? NULL)) { + $files = $this->fileEntityStorage->loadByProperties(['uuid' => $decoded_object->fileuuid]); + //@TODO for security. Check if the file is attached to the node too. + $file = reset($files); + /* @var File $file */ + if ($file) { + $this->expanded_argument = new \stdClass; + $this->expanded_argument->iiif_image_id = $file->getFileUri(); + $fragment_pieces = explode("xywh=percent:",$decoded_object->fragment); + if (count($fragment_pieces) == 2) { + $xywh = explode(",", $fragment_pieces[1]); + if (count($xywh) == 4) { + // we got them all + $this->expanded_argument->bbox = (object) array_combine(['x','y','w','h'], $xywh); + $this->argument_validated = TRUE; + } + } + } } + } + /* const image_data = { + "fileuuid": groupssetting.file_uuid, + "nodeuuid": groupssetting.nodeuuid, + "fragment": annotation.target.selector.value, + "textualbody": annotation.body?.value + } */ } } return $this->argument_validated ?? FALSE; From 2e7f8837226508ef87b02d2cbc5d9a2a33d2fe94 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Thu, 23 May 2024 13:20:14 -0400 Subject: [PATCH 27/44] Why am i so distracted? wrong accessor --- .../AbstractPostProcessorQueueWorker.php | 2 +- .../MLMobileNetPostProcessor.php | 62 +++++++++++-------- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php index 082f357..6aef559 100644 --- a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php @@ -384,7 +384,7 @@ public function processItem($data) { $toindex->vector_512 = $io->output->searchapi['vector_512'] ?? NULL; $toindex->vector_576 = $io->output->searchapi['vector_576'] ?? NULL; $toindex->vector_1024 = $io->output->searchapi['vector_1024'] ?? NULL; - $toindex->service_md5 = $io->output->searchapi['vector_1024'] ?? 
''; + $toindex->service_md5 = $io->output->searchapi['service_md5'] ?? ''; // $siblings will be the amount of total children processors that were // enqueued for a single Processor chain. diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php index b55ee57..d049bad 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php @@ -105,7 +105,9 @@ protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { // based on the % of the bounding box? // Just the value? foreach($ML['mobilenet']['objects'] as $object) { - $labels[$object['name']] = $object['name']; + if (isset($category['category_name'])) { + $labels[$category['category_name']] = $category['category_name']; + } } } $output->searchapi['metadata'] = $labels; @@ -127,40 +129,48 @@ protected function mobilenetToMiniOCR(array $objects, $width, $height, $pageid) // To avoid divisions by 0 $pwidth = (float) $width; $pheight = (float) $height; + // Format here is different. Instead of normalizing on Python we do here? + // @TODO make all methods in python act the same + // :[{"bounding_box":{"height":0.9609375,"origin_x":0.0,"origin_y":0.0453125,"width":1.0},"categories":[{"category_name":"person","display_name":null,"index":null,"score":0.8881509304046631}] // NOTE: floats are in the form of .1 so we need to remove the first 0. $miniocr->startElement("p"); - $miniocr->writeAttribute("xml:id", 'ml_yolo_' . $pageid); + $miniocr->writeAttribute("xml:id", 'ml_mobilenet_' . $pageid); $miniocr->writeAttribute("wh", ltrim($pwidth ?? '', 0) . " " . ltrim($pheight ?? 
'', 0)); $miniocr->startElement("b"); foreach ($objects as $object) { $notFirstWord = FALSE; - $miniocr->startElement("l"); - $x0 = (float) $object['box']['x1']; - $y0 = (float) $object['box']['y1']; - $x1 = (float) $object['box']['x2']; - $y1 = (float) $object['box']['y2']; - $l = ltrim(sprintf('%.3f', $x0) ?? '', 0); - $t = ltrim(sprintf('%.3f', $y0) ?? '', 0); - $w = ltrim(sprintf('%.3f', ($x1 - $x0)) ?? '', 0); - $h = ltrim(sprintf('%.3f', ($y1 - $y0)) ?? '', 0); - $text = (string) ($object['name'] ?? 'Unlabeled') .' ~ '. (string) ("{$object['confidence']}" ?? "0"); - if ($notFirstWord) { - $miniocr->text(' '); - } - $notFirstWord = TRUE; - // New OCR Highlight does not like empty tags at all - if (strlen(trim($text ?? '')) > 0) { - $miniocr->startElement("w"); - $miniocr->writeAttribute("x", - $l . ' ' . $t . ' ' . $w . ' ' . $h); - $miniocr->text($text); - // Only assume we have at least one word for tags - // Since lines? could end empty? - $atleastone_word = TRUE; + if ($object['bounding_box'] ?? FALSE) { + $miniocr->startElement("l"); + $x0 = (float)$object['bounding_box']['origin_x']; + $y0 = (float)$object['bounding_box']['origin_y']; + $w = (float)$object['bounding_box']['width']; + $h = (float)$object['bounding_box']['height']; + $l = ltrim(sprintf('%.3f', $x0) ?? '', 0); + $t = ltrim(sprintf('%.3f', $y0) ?? '', 0); + $w = ltrim(sprintf('%.3f', $w) ?? '', 0); + $h = ltrim(sprintf('%.3f', $h) ?? '', 0); + $text = ''; + foreach ($object['categories'] as $category) { + $text .= (string)($category['category_name'] ?? 'Unlabeled') . ' ~ ' . (string)sprintf('%.3f', $category['score'] ?? 0); + } + if ($notFirstWord) { + $miniocr->text(' '); + } + $notFirstWord = TRUE; + // New OCR Highlight does not like empty tags at all + if (strlen(trim($text ?? '')) > 0) { + $miniocr->startElement("w"); + $miniocr->writeAttribute("x", + $l . ' ' . $t . ' ' . $w . ' ' . $h); + $miniocr->text($text); + // Only assume we have at least one word for tags + // Since lines? 
could end empty? + $atleastone_word = TRUE; + $miniocr->endElement(); + } $miniocr->endElement(); } - $miniocr->endElement(); } $miniocr->endElement(); $miniocr->endElement(); From aba8b7618e3d23a7063ef1905992fa5c530f8a35 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Fri, 24 May 2024 11:54:46 -0400 Subject: [PATCH 28/44] You made typos 3 years ago? "consume this ouput" --- .../QueueWorker/AbstractPostProcessorQueueWorker.php | 2 +- .../FrictionlessDataPackagePostProcessor.php | 2 +- .../JsonFileSequencePostProcessor.php | 2 +- .../MLMobileNetPostProcessor.php | 3 +++ .../MLYoloPostProcessor.php | 4 ++++ .../StrawberryRunnersPostProcessor/OcrPostProcessor.php | 2 +- .../SubtitlePostProcessor.php | 2 +- .../SystemBinaryPostProcessor.php | 2 +- .../StrawberryRunnersPostProcessor/TextPostProcessor.php | 2 +- .../WarcExtractionPostProcessor.php | 2 +- .../WebPageTextPostProcessor.php | 2 +- .../abstractMLPostProcessor.php | 8 ++++---- .../StrawberryRunnersPostProcessorPluginManager.php | 3 +-- 13 files changed, 21 insertions(+), 15 deletions(-) diff --git a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php index 6aef559..acbd487 100644 --- a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php @@ -450,7 +450,7 @@ public function processItem($data) { $input_property = $plugin_info['plugin_definition']['input_property']; $input_argument = $plugin_info['plugin_definition']['input_argument']; //@TODO check if this are here and not null! 
- // $io->ouput will contain whatever the output is + // $io->output will contain whatever the output is // We will check if the child processor // contains a property contained in $output // If so we check if there is a single value or multiple ones diff --git a/src/Plugin/StrawberryRunnersPostProcessor/FrictionlessDataPackagePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/FrictionlessDataPackagePostProcessor.php index 92d087f..6dc2734 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/FrictionlessDataPackagePostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/FrictionlessDataPackagePostProcessor.php @@ -122,7 +122,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)', ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? $this->getConfiguration()['output_destination'] : [], - '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output.'), '#required' => TRUE, ]; diff --git a/src/Plugin/StrawberryRunnersPostProcessor/JsonFileSequencePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/JsonFileSequencePostProcessor.php index 9c59096..b19c7ff 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/JsonFileSequencePostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/JsonFileSequencePostProcessor.php @@ -182,7 +182,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug // We use the actual file UUID to as part of the ID // e.g default_solr_index-strawberryfield_flavor_datasource/5801:1:en:1e9f687c-e29e-4c23-91ba-655d9c5cdfe6:ocr // For the 
general ID we will use this number when there are multiple siblings - // or 1 if the File is a single ouput + // or 1 if the File is a single output $sequence_number[] = $io->input->metadata['sequence']; } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php index d049bad..e704778 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php @@ -194,4 +194,7 @@ public function callImageML($image_url, $labels):mixed { return $ML; } + public function callTextML($text, $query):mixed { + return FALSE; + } } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php index 010167f..95978b8 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php @@ -184,4 +184,8 @@ public function callImageML($image_url, $labels):mixed { return $ML; } + public function callTextML($text, $query):mixed { + return FALSE; + } + } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php index 88f5d97..41ac994 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php @@ -205,7 +205,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)', ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? 
$this->getConfiguration()['output_destination'] : [], - '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output. This plugin always generates also search API output data.'), '#required' => TRUE, ]; diff --git a/src/Plugin/StrawberryRunnersPostProcessor/SubtitlePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/SubtitlePostProcessor.php index 3161e6b..1dc3877 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/SubtitlePostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/SubtitlePostProcessor.php @@ -142,7 +142,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)', ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? 
$this->getConfiguration()['output_destination'] : [], - '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output.'), '#required' => TRUE, ]; diff --git a/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php index 94bfc90..2d5aadb 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php @@ -142,7 +142,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)', ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? 
$this->getConfiguration()['output_destination'] : [], - '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output.'), '#required' => TRUE, ]; diff --git a/src/Plugin/StrawberryRunnersPostProcessor/TextPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/TextPostProcessor.php index 4455bc5..03ccf04 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/TextPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/TextPostProcessor.php @@ -141,7 +141,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)', ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? 
$this->getConfiguration()['output_destination'] : [], - '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output.'), '#required' => TRUE, ]; diff --git a/src/Plugin/StrawberryRunnersPostProcessor/WarcExtractionPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/WarcExtractionPostProcessor.php index 68bfb76..214af14 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/WarcExtractionPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/WarcExtractionPostProcessor.php @@ -112,7 +112,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'plugin' => 'As Input for another processor Plugin', ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination']))? 
$this->getConfiguration()['output_destination']: [], - '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output.'), '#required' => TRUE, ]; diff --git a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php index aebd46a..cbd08f8 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php @@ -84,7 +84,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)', ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? $this->getConfiguration()['output_destination'] : [], - '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output. 
This plugin always generates also search API output data.'), '#required' => TRUE, ]; diff --git a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php index 4fe1418..12085f4 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php @@ -46,7 +46,7 @@ public function defaultConfiguration() { ]; public const ML_TEXT_VECTOR_SIZE = [ - '/text/bert' => 384, + '/text/sentence_transformer' => 384, ]; protected $nlp_client = null; @@ -155,7 +155,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for ML Vector Comparison)', ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? $this->getConfiguration()['output_destination'] : [], - '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output.'), '#required' => TRUE, ]; @@ -264,7 +264,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug if (in_array($config['source_type'], ['asstructure']) && isset($io->input->{$input_property}) && $file_uuid && $node_uuid) { $mloutput = $this->runImageMLfromIIIF($io, $nlp); - $io->output = $mloutput ?? $output;; + $io->output = $mloutput ?? 
$output; } elseif (in_array($config['source_type'], ['ado', 'json']) && $node_uuid) { $mloutput = $this->runTextMLfromMetadata($io, $nlp); @@ -314,7 +314,7 @@ public function getVectorMLInfo() { } abstract public function callImageML($image_url, $labels):mixed; - + abstract public function callTextML($text, $query):mixed; protected function getNLPClient() { if ($this->nlp_client) { diff --git a/src/Plugin/StrawberryRunnersPostProcessorPluginManager.php b/src/Plugin/StrawberryRunnersPostProcessorPluginManager.php index 606247e..a453c97 100644 --- a/src/Plugin/StrawberryRunnersPostProcessorPluginManager.php +++ b/src/Plugin/StrawberryRunnersPostProcessorPluginManager.php @@ -39,5 +39,4 @@ public function __construct( $this->setCacheBackend($cache_backend,'strawberry_runners_strawberryrunnerspostprocessor_plugins'); } - -} \ No newline at end of file +} From 0b9e503fdd96246fd8f0dbe87a0d4843cff46b50 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Fri, 24 May 2024 11:56:48 -0400 Subject: [PATCH 29/44] Put the right time this was created. Not 2022 --- .../MLMobileNetPostProcessor.php | 4 ++-- .../MLYoloPostProcessor.php | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php index e704778..d8f3f0e 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php @@ -2,8 +2,8 @@ /** * Created by PhpStorm. 
* User: dpino - * Date: 11/18/22 - * Time: 2:01 PM + * Date: 05/22/24 + * Time: 8:07AM */ namespace Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessor; diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php index 95978b8..45f7c94 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php @@ -1,10 +1,10 @@ Date: Sat, 25 May 2024 19:20:54 -0400 Subject: [PATCH 30/44] Small updates. Insightface is working now One annoying thing is that because how i built Runners a runner can not generate (without a pager) multiple Flavors. Insightface so far is the one where embeddings have a closer alignment with the logic of detecting a face. The main features of a faces are encoded in the Vector which in test allowed me to even get family members (tested with myself and mom!) but also means that if a single image has multiple detections i need multiple Vectors, and Solr allows on vector per field/per document. Will explore the one to many option in our code @alliomeria --- .../AbstractPostProcessorQueueWorker.php | 2 - .../MLInsightfacePostProcessor.php | 194 ++++++++++++++++++ .../abstractMLPostProcessor.php | 6 +- 3 files changed, 196 insertions(+), 6 deletions(-) create mode 100644 src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php diff --git a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php index acbd487..e77c7c2 100644 --- a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php @@ -237,11 +237,9 @@ public function processItem($data) { return; } - // Get the whole processing chain $childprocessorschain = $this->getChildProcessorIds($data->plugin_config_entity_id ?? 
'', true); - $needs_localfile_cleanup = FALSE; // If a child processor at any level will eventually chain up to a leaf (means generate queue items again) $will_chain_future = FALSE; // Just in case someone decides to avoid setting this one up diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php new file mode 100644 index 0000000..306767c --- /dev/null +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php @@ -0,0 +1,194 @@ + 'asstructure', + 'mime_type' => ['image/jpeg'], + 'output_type' => 'json', + 'output_destination' => 'searchapi', + 'processor_queue_type' => 'background', + 'language_key' => 'language_iso639_3', + 'language_default' => 'eng', + 'timeout' => 300, + 'nlp_url' => 'http://esmero-nlp:6400', + 'ml_method' => '/image/insightface', + ] + parent::defaultConfiguration(); + } + + public function settingsForm(array $parents, FormStateInterface $form_state) { + $element = parent::settingsForm($parents, $form_state); + return $element; + } + + protected function runTextMLfromMetadata($io, NlpClient $nlpClient): \stdClass { + $output = new \stdClass(); + return $output; + } + + protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { + $output = new \stdClass(); + $config = $this->getConfiguration(); + $input_argument = $this->pluginDefinition['input_argument']; + $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? '') : 'eng']; + $sequence_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; + setlocale(LC_CTYPE, 'en_US.UTF-8'); + $width = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['width'] ?? NULL; + $height = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['height'] ?? 
NULL; + if (!($width && $height)) { + $width = $io->input->metadata['flv:exif']['ImageWidth'] ?? NULL; + $height = $io->input->metadata['flv:exif']['ImageHeight'] ?? NULL; + } + $iiifidentifier = urlencode( + StreamWrapperManager::getTarget( isset($io->input->metadata['url']) ? $io->input->metadata['url'] : NULL) + ); + + if ($iiifidentifier == NULL || empty($iiifidentifier)) { + return $output; + } + /// Mobilenet does its own (via mediapipe) image scalling. So we can pass a smaller if needed. Internally + /// it uses 480 x 480 but not good to pass square bc it makes % bbox calculation harder. + // But requires us to call info.json and pre-process the sizes. + $iiif_image_url = $config['iiif_server']."/{$iiifidentifier}/full/full/0/default.jpg"; + //@TODO we are not filtering here by label yet. Next release. + $labels = []; + $page_text = NULL; + $output->plugin = NULL; + $labels = []; + $ML = $this->callImageML($iiif_image_url,$labels); + $output->searchapi['vector_512'] = isset($ML['insightface']['vector']) && is_array($ML['insightface']['vector']) && count($ML['insightface']['vector'])== 512 ? $ML['insightface']['vector'] : NULL; + if (isset($ML['insightface']['objects']) && is_array($ML['insightface']['objects']) && count($ML['insightface']['objects']) > 0 ) { + $miniocr = $this->insightfacenetToMiniOCR($ML['insightface']['objects'], $width, $height, $sequence_number); + $output->searchapi['fulltext'] = $miniocr; + $output->plugin = $miniocr; + $page_text = isset($output->searchapi['fulltext']) ? strip_tags(str_replace("", + PHP_EOL . " ", $output->searchapi['fulltext'])) : ''; + // What is a good confidence ratio here? + // based on the % of the bounding box? + // Just the value? + $labels['Face'] = 'Face'; + } + $output->searchapi['metadata'] = $labels; + $output->searchapi['service_md5'] = isset($ML['insightface']['modelinfo']) ? md5(json_encode($ML['insightface']['modelinfo'])) : NULL; + $output->searchapi['plaintext'] = $page_text ?? 
''; + $output->searchapi['processlang'] = $file_languages; + $output->searchapi['ts'] = date("c"); + $output->searchapi['label'] = $this->t("Insightface ML Image Embeddings & Vectors") . ' ' . $sequence_number; + return $output; + } + + + protected function insightfacenetToMiniOCR(array $objects, $width, $height, $pageid) { + $miniocr = new \XMLWriter(); + $miniocr->openMemory(); + $miniocr->startDocument('1.0', 'UTF-8'); + $miniocr->startElement("ocr"); + $atleastone_word = FALSE; + // To avoid divisions by 0 + $pwidth = (float) $width; + $pheight = (float) $height; + // Format here is again different. Instead of normalizing on Python we do here? + // @TODO make all methods in python act the same + // :[{"bbox":[x1,y1,x2,y2],"score":0.8881509304046631}] + // We are not using labels here. We have age, gender. Discriminatory! + // NOTE: floats are in the form of .1 so we need to remove the first 0. + $miniocr->startElement("p"); + $miniocr->writeAttribute("xml:id", 'ml_insightface_' . $pageid); + $miniocr->writeAttribute("wh", + ltrim($pwidth ?? '', 0) . " " . ltrim($pheight ?? '', 0)); + $miniocr->startElement("b"); + foreach ($objects as $object) { + $notFirstWord = FALSE; + if ($object['bbox'] ?? FALSE) { + $miniocr->startElement("l"); + $x0 = (float)$object['bbox'][0]; + $y0 = (float)$object['bbox'][1]; + $w = (float)$object['bbox'][2]- $x0; + $h = (float)$object['bbox'][3] -$y0; + $l = ltrim(sprintf('%.3f', $x0) ?? '', 0); + $t = ltrim(sprintf('%.3f', $y0) ?? '', 0); + $w = ltrim(sprintf('%.3f', $w) ?? '', 0); + $h = ltrim(sprintf('%.3f', $h) ?? '', 0); + $text .= (string)('Face') . ' ~ ' . (string)sprintf('%.3f', $object['score'] ?? 0); + + if ($notFirstWord) { + $miniocr->text(' '); + } + $notFirstWord = TRUE; + // New OCR Highlight does not like empty tags at all + if (strlen(trim($text ?? '')) > 0) { + $miniocr->startElement("w"); + $miniocr->writeAttribute("x", + $l . ' ' . $t . ' ' . $w . ' ' . 
$h); + $miniocr->text($text); + // Only assume we have at least one word for tags + // Since lines? could end empty? + $atleastone_word = TRUE; + $miniocr->endElement(); + } + $miniocr->endElement(); + } + } + $miniocr->endElement(); + $miniocr->endElement(); + $miniocr->endElement(); + $miniocr->endDocument(); + if ($atleastone_word) { + return $miniocr->outputMemory(TRUE); + } + else { + return StrawberryfieldFlavorDatasource::EMPTY_MINIOCR_XML; + } + } + + public function callImageML($image_url, $labels):mixed { + $nlpClient = $this->getNLPClient(); + $config = $this->getConfiguration(); + $arguments['iiif_image_url'] = $image_url; + //@TODO we are not filtering here by label yet. Next release. + $arguments['labels'] = $labels; + $ML = $nlpClient->get_call($config['ml_method'], $arguments, 1); + return $ML; + } + + public function callTextML($text, $query):mixed { + return FALSE; + } +} diff --git a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php index 12085f4..6fc4774 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php @@ -42,7 +42,7 @@ public function defaultConfiguration() { public const ML_IMAGE_VECTOR_SIZE = [ '/image/yolo' => 576, '/image/mobilenet' => 1024, - '/image/insightfacet' => 512, + '/image/insightface' => 512, ]; public const ML_TEXT_VECTOR_SIZE = [ @@ -186,7 +186,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { '/image/yolo' => 'yolov8 (Image Object detection (as MiniOCR Annotations) & embedding as a Unit Length Vector)', '/image/mobilenet' => 'MobileNet (Image embeddings as a a Unit Length Vector)', '/text/bert' => 'Bert (text embeddings as a Unit Length Vector)', - '/image/insightface' => 'InsightFace (Detection only as MiniOCR Annotations)', + '/image/insightface' => 'InsightFace (Detection as MiniOCR 
Annotations and embedding as a Unit Length Vector)', ], '#default_value' => $this->getConfiguration()['ml_method'], '#description' => $this->t('The ML endpoint/Model. Depending on the choice the actual value/size of data ingested will vary.'), @@ -260,8 +260,6 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug && is_array($capabilities) && is_array($capabilities['web64']['endpoints']) && in_array($config['ml_method'], $capabilities['web64']['endpoints'])) { - - if (in_array($config['source_type'], ['asstructure']) && isset($io->input->{$input_property}) && $file_uuid && $node_uuid) { $mloutput = $this->runImageMLfromIIIF($io, $nlp); $io->output = $mloutput ?? $output; From 257aa9e147873c4176ed753b68c78e48802fdfe9 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sun, 26 May 2024 09:38:57 -0400 Subject: [PATCH 31/44] Adds KNN Text exposed filter --- .../filter/StrawberryRunnersMLTextfilter.php | 564 ++++++++++++++++++ 1 file changed, 564 insertions(+) create mode 100644 src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php diff --git a/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php b/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php new file mode 100644 index 0000000..0439d93 --- /dev/null +++ b/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php @@ -0,0 +1,564 @@ +setSbrEntityStorage( + $container->get('entity_type.manager')->getStorage('strawberry_runners_postprocessor') + ); + $plugin->setFieldsHelper($container->get('search_api.fields_helper')); + $plugin->setViewStorage( + $container->get('entity_type.manager')->getStorage('view') + ); + $plugin->setViewStorage( + $container->get('entity_type.manager')->getStorage('view') + ); + $plugin->setCache($container->get('cache.default')); + $plugin->currentUser = $container->get('current_user'); + $plugin->strawberryRunnerUtilityService = $container->get( + 'strawberry_runner.utility' + ); + $plugin->strawberryRunnerProcessorPluginManager = 
$container->get( + 'strawberry_runner.processor_manager' + ); + return $plugin; + } + + + /** + * {@inheritdoc} + */ + public function defineOptions() { + $options = parent::defineOptions(); + $options['value']['default'] = []; + $options['sbf_fields'] = ['default' => NULL]; + $options['pre_query'] = ['default' => TRUE]; + $options['pre_query_facets'] = ['default' => TRUE]; + $options['topk'] = ['default' => 3]; + $options['ml_strawberry_postprocessor'] = ['default' => NULL]; + return $options; + } + + public function setSbrEntityStorage(EntityStorageInterface $sbrEntityStorage): StrawberryRunnersMLImagefilter + { + $this->sbrEntityStorage = $sbrEntityStorage; + return $this; + } + + protected function canBuildGroup() { + return FALSE; + } + + /** + * {@inheritdoc} + */ + public function defaultExposeOptions() { + parent::defaultExposeOptions(); + $this->options['expose']['reduce'] = FALSE; + } + + protected function valueSubmit($form, FormStateInterface $form_state) { + $form_state = $form_state; + } + + protected function valueValidate($form, FormStateInterface $form_state) { + $form_state->setValue(['options', 'value'], []); + } + + + public function setFieldsHelper(FieldsHelperInterface $fieldsHelper) { + $this->fieldsHelper = $fieldsHelper; + return $this; + } + + /** + * Sets the View Storage. + * + * @param \Drupal\Core\Entity\EntityStorageInterface $viewstorage + * The view Storage. + * + * @return $this + */ + public function setViewStorage(EntityStorageInterface $viewstorage) { + $this->viewStorage = $viewstorage; + return $this; + } + + /** + * Sets the Cache Backed. + * + * @param \Drupal\Core\Cache\CacheBackendInterface $cache + * The cache backend. Use to store complex calculations of property paths. 
+ * + * @return $this + */ + public function setCache(CacheBackendInterface $cache) { + $this->cache = $cache; + return $this; + } + + public function showOperatorForm(&$form, FormStateInterface $form_state) { + } + + /** + * {@inheritdoc} + */ + public function buildOptionsForm(&$form, FormStateInterface $form_state) { + parent::buildOptionsForm($form, $form_state); + $active_plugins = $this->strawberryRunnerUtilityService->getActivePluginConfigs(FALSE); + + foreach ($active_plugins as $by_source => $plugins) { + foreach ($plugins as $entity_id => $active_plugin) { + if (isset($active_plugin['ml_method'])) { + if (in_array($active_plugin['ml_method'], array_keys(abstractMLPostProcessor::ML_TEXT_VECTOR_SIZE))){ + $post_processor_options[$entity_id] = $active_plugin['ml_method'] . "({$entity_id})"; + } + } + } + } + + $fields = $this->getSbfDenseVectorFields() ?? []; + $form['sbf_fields'] = [ + '#type' => 'select', + '#title' => $this->t( + 'KNN Dense Vector Field to query against' + ), + '#description' => $this->t( + 'Select the fields that will be used to query against.' + ), + '#options' => $fields, + '#multiple' => FALSE, + '#default_value' => $this->options['sbf_fields'], + '#required' => TRUE, + ]; + $form['pre_query'] = [ + '#type' => 'checkbox', + '#default_value' => $this->options['pre_query'], + '#title' => $this->t('Treat previous filters to this as prequeries'), + '#description'=> $this->t( + 'If any other filter setup before this one will be treated as pre-queries to the actual KNN query.' + ), + ]; + $form['pre_query_facets'] = [ + '#type' => 'checkbox', + '#default_value' => $this->options['pre_query_facets'], + '#title' => $this->t('Treat also facets, if any, as prequeries'), + '#description'=> $this->t( + 'If any other facets will be treated as pre-queries to the actual KNN query.' 
+ ), + ]; + $form['topk'] = [ + '#type' => 'number', + '#default_value' => $this->options['topk'], + '#title' => $this->t('Top Similarity KNN hits to request to the backend.'), + '#description'=> $this->t( + 'The more, the slower' + ), + '#min' => 1, + '#max' => 100, + ]; + $form['ml_strawberry_postprocessor'] = [ + '#type' => 'select', + '#title' => $this->t( + 'Strawberry Runners processor to extract the on-the fly embedding' + ), + '#description' => $this->t( + 'Select the ML Strawberry Runners Processor that was used to index Vectors into the field you are going to search against. These need to match' + ), + '#options' => $post_processor_options, + '#multiple' => FALSE, + '#default_value' => $this->options['ml_strawberry_postprocessor'], + '#required' => TRUE, + ]; + } + /** + * Validate the options form. + */ + public function validateOptionsForm(&$form, FormStateInterface $form_state) { + // We need to validate that the selected field is of the same source/size as model that will + // be used to generate the on the fly vectors. + // So we need to load the SBR entity passed, compare the model against the constant present in + // \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessor\abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE + // and then load the field and see if the source (is of the same SBFlavor property/size (vector_576, etc) + $valid = FALSE; + $options = $form_state->getValue('options'); + $processor_id = $options['ml_strawberry_postprocessor'] ?? NULL; + $field_id = $options['sbf_fields']; + if ($processor_id) { + /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity|null */ + $plugin_config_entity = $this->sbrEntityStorage->load($processor_id); + if ($plugin_config_entity->isActive()) { + $sbr_config = $plugin_config_entity->getPluginconfig(); + // Note, we could also restrict to the same image mimetypes that the processor is setup to handle? 
+ if (isset($sbr_config['ml_method'])) { + $vector_size = abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$sbr_config['ml_method']] ?? ''; + $field_info = $this->getSbfDenseVectorFieldSource($field_id); + if ($field_info) { + // We do allow mixed data sources. One can be a node of course even if the source is a flavor. This is because each source could inherit properties from the other. + $propath_pieces = explode('/', $field_info->getCombinedPropertyPath()); + if (!(end($propath_pieces) == 'vector_' .$vector_size && $field_info->getType() == 'densevector_' . $vector_size)) { + $form_state->setErrorByName('options][ml_strawberry_postprocessor', $this->t('The Field/Processor combination is not right. Make sure your Configured KNN Dense Vector Field and the Strawberry Processor are targeting the same Vector Dimensions (e.g first one is from a vector_576 data source property and the field type is densevector_576 and the processor is calling YOLO)')); + } + } + else { + // The field is gone. + $form_state->setErrorByName('options][sbf_fields', $this->t('CConfigured KNN Dense Vector Field does not longer exists. Please replace your config with a valid/indexed field.')); + } + } + } + } + } + + public function submitOptionsForm(&$form, FormStateInterface $form_state) { + parent::submitOptionsForm( + $form, $form_state + ); + } + + protected function valueForm(&$form, FormStateInterface $form_state) { + // At this stage $this->value is not set? + $this->value = is_array($this->value) ? $this->value : (array) $this->value; + if (!$form_state->get('exposed')) { + $form['value'] = [ + '#type' => 'textarea', + '#title' => t('JSON used to query internal form'), + '#prefix' => '
', + '#suffix' => '
' + ]; + } + elseif ($this->isExposed()) { + $form['value'] = [ + '#type' => 'textarea', + '#title' => t('JSON used to query public form'), + '#prefix' => '
', + '#suffix' => '
' + ] ; + } + } + + public function hasExtraOptions() { + return FALSE; + } + + /** + * @inheritDoc + */ + protected function operatorForm(&$form, FormStateInterface $form_state) { + parent::operatorForm($form, $form_state); // TODO: Change the autogenerated stub + } + + + /** + * {@inheritdoc} + */ + public function buildExposeForm(&$form, FormStateInterface $form_state) { + parent::buildExposeForm($form, $form_state); + unset($form['expose']['reduce']); + } + + + public function query() { + if (empty($this->value) || empty($this->validated_exposed_input) || !$this->getQuery()) { + // basically not validated, not present as a value and also someone cancelled/nuklled the query before? + return; + } + /* + * $this->value = {stdClass} + iiif_image_id = "s3://3b9%2Fimage-dcpl-p034-npsncr-00015-rexported-f2c69aeb-7bcb-434a-a781-e580cb3695b7.tiff" + bbox = {stdClass} + x = {float} 0.0 + y = {float} 0.0 + w = {float} 1.0 + h = {float} 1.0 + */ + // Just to be sure here bc we have our own way. 
Who knows if some external code decides to alter the value + $this->value = $this->validated_exposed_input; + // We should only be at this stage if we have validation + if (is_array($this->value) && !empty($this->value)) { + $this->value = array_values($this->value); + $this->value = reset($this->value); + } + if (is_string($this->value)) { + $this->value = trim($this->value); + } + if (empty($this->value)) { + return; + } + + + $processor_id = $this->options['ml_strawberry_postprocessor']; + /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity|null */ + $plugin_config_entity = $this->sbrEntityStorage->load($processor_id); + if ($plugin_config_entity->isActive()) { + $sbr_config = $plugin_config_entity->getPluginconfig(); + // Now we need to actually generate an instance of the runner using the config + $entity_id = $plugin_config_entity->id(); + $configuration_options = $plugin_config_entity->getPluginconfig(); + $configuration_options['configEntity'] = $entity_id; + /* @var \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginInterface $plugin_instance */ + $plugin_instance + = $this->strawberryRunnerProcessorPluginManager->createInstance( + $plugin_config_entity->getPluginid(), + $configuration_options + ); + if ($plugin_instance instanceof abstractMLPostProcessor) { + try { + $response = $plugin_instance->callTextML($this->value, TRUE); + } + catch (\Exception $exception) { + // Give user feedback + return; + } + if (!empty($response['error'])) { + // we should log this + return; + } + elseif (isset($response['message'])) { + // Now here is an issue. Each endpoint will return the vector inside a yolo/etc. + // We should change that and make it generic (requires new pythong code/rebuilding NLP container) + // so for now i will use the ml method config split/last to get the right key. 
+ foreach (["error","message","web64"] as $remove) { + unset($response[$remove]); + } + $all_knns = $this->getQuery()->getOption('sbf_knn') ?? []; + foreach ($response as $endpoint_key => $values) { + if (isset($values['vector']) && is_array($values['vector']) && count($values['vector']) == abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$sbr_config['ml_method']]) { + $all_knns[] = $this->buildKNNQuery($this->getQuery(), $values['vector']); + } + } + array_filter($all_knns); + if (count($all_knns)) { + $this->getQuery()->setOption('sbf_knn', $all_knns); + } + } + } + } + return; + } + + + public function validate() { + + // For values passed by direct reference we will require/assume + // $json_for_url = base64_encode(gzcompress($json)); + // And this operation will happen on reading/setting back and forth. + $errors = parent::validate(); + if (is_array($this->value)) { + if ($this->options['exposed'] && !$this->options['expose']['required'] + && empty($this->value) + ) { + // Don't validate if the field is exposed and no default value is provided. + return $errors; + } + // Choose different kind of output for 0, a single and multiple values. + if (count($this->value) == 0) { + $errors[] = $this->t( + 'No valid values found on filter: @filter.', + ['@filter' => $this->adminLabel(TRUE)] + ); + } + } + return $errors; + } + + public function validateExposed(&$form, FormStateInterface $form_state) { + // Only validate exposed input. + // In theory this is where i can alter the actual form state input + // to set a different URL argument? compress? 
+    if (empty($this->options['exposed'])
+      || empty($this->options['expose']['identifier'])
+    ) {
+      return;
+    }
+
+    $this->validated_exposed_input = NULL;
+    $identifier = $this->options['expose']['identifier'];
+    $input = $form_state->getValue($identifier);
+    if (is_string($input)) {
+      // trim() returns the trimmed copy and leaves its argument untouched, so
+      // the result must be assigned for whitespace-only input to count as empty.
+      $input = trim($input);
+      if (strlen($input) == 0) {
+        return;
+      }
+    }
+    $values = $input;
+    if ($values) {
+      $this->validated_exposed_input = $values;
+    }
+  }
+
+
+
+  /**
+   * Applies the previously validated exposed input as this filter's value.
+   *
+   * Called during the form submit itself.
+   *
+   * @param mixed $input
+   *   The exposed input, handed unchanged to the parent implementation.
+   *
+   * @return mixed
+   *   The parent's result, returned unchanged. A falsy result means the filter
+   *   won't be included in / alter the generated query, and $this->value is
+   *   left untouched in that case.
+   */
+  public function acceptExposedInput($input) {
+    $rc = parent::acceptExposedInput($input);
+    if ($rc) {
+      // Override with the validated input when there is one, reset otherwise.
+      $this->value = $this->validated_exposed_input ?? NULL;
+    }
+    return $rc;
+  }
+
+  /**
+   * @inheritDoc
+   */
+  public function submitExposed(&$form, FormStateInterface $form_state)
+  {
+    parent::submitExposed($form, $form_state); // TODO: Change the autogenerated stub
+  }
+
+
+  /**
+   * Retrieves all index fields whose type starts with "densevector_".
+   *
+   * @return string[]
+   *   An options list of field identifiers mapped to their prefixed labels,
+   *   each followed by the field identifier in parentheses. Empty when the
+   *   index cannot be loaded.
+   */
+  protected function getSbfDenseVectorFields() {
+    $fields = [];
+    // Presumably strips the 17-character 'search_api_index_' prefix from the
+    // views table name to obtain the index id; verify against Search API.
+    /** @var \Drupal\search_api\IndexInterface|null $index */
+    $index = Index::load(substr($this->table, 17));
+    if (!$index) {
+      // Nothing to offer if the index is gone; avoids calling a method on NULL.
+      return $fields;
+    }
+
+    foreach ($index->getFields() as $field_id => $field) {
+      // NOTE(review): both values below are fetched but never used; presumably
+      // leftovers from an earlier filter condition. Confirm before removing.
+      $property_path = $field->getPropertyPath();
+      $datasource_id = $field->getDatasourceId();
+      if (str_starts_with($field->getType(), 'densevector_') === TRUE) {
+        // NOTE(review): the return value is discarded; kept as-is in case the
+        // call is relied on to fail early for broken fields. Confirm.
+        $field->getDataDefinition();
+        $fields[$field_id] = $field->getPrefixedLabel() . '('. $field->getFieldIdentifier() .')';
+      }
+    }
+    return $fields;
+  }
+
+  /**
+   * Loads a single field of this view's index.
+   *
+   * @param string $field_id
+   *   The field identifier passed to the index's getField().
+   *
+   * @return mixed
+   *   Whatever the index returns for that field, or NULL when the index
+   *   itself cannot be loaded.
+   */
+  protected function getSbfDenseVectorFieldSource($field_id) {
+    /** @var \Drupal\search_api\IndexInterface|null $index */
+    $index = Index::load(substr($this->table, 17));
+    return $index ? $index->getField($field_id) : NULL;
+  }
+
+  /**
+   * Builds the Solr {!knn} query string for the configured vector field.
+   *
+   * @param \Drupal\search_api\Plugin\views\query\SearchApiQuery $query
+   *   The query whose index and server backend are inspected.
+   * @param array $vector
+   *   The embedding values; they are joined with ', ' between the brackets.
+   *
+   * @return array|null
+   *   NULL when the backend is not a Solr backend. Otherwise a list holding one
+   *   KNN query string, or an empty list when the field selected in
+   *   $this->options['sbf_fields'] has no Solr field name.
+   *
+   * @throws \Drupal\search_api\SearchApiException
+   */
+  protected function buildKNNQuery(SearchApiQuery $query, array $vector=[]):array|null {
+    $backend = $query->getIndex()->getServerInstance()->getBackend();
+    // We can only use Solr kids.
+    if (!($backend instanceof \Drupal\search_api_solr\SolrBackendInterface)) {
+      // The declared return type is array|null: returning FALSE here would
+      // raise a TypeError instead of signalling "no query".
+      return NULL;
+    }
+    $solr_query_string = [];
+    $allfields_translated_to_solr = $backend
+      ->getSolrFieldNames($query->getIndex());
+    if (isset($allfields_translated_to_solr[$this->options['sbf_fields']])) {
+      // {!knn f=vector topK=3}[-9.01364535e-03, -7.26634488e-02, -1.73818860e-02, ..., -1.16323479e-01]
+      $solr_query_string[] = "{!knn f={$allfields_translated_to_solr[$this->options['sbf_fields']]} topK={$this->options['topk']}}[" . implode(', ', $vector) . ']';
+    }
+    return $solr_query_string;
+  }
+}
From f0ccaee488a6616c54944a3b9902af5c8f5bfb84 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sun, 26 May 2024 17:11:41 -0400 Subject: [PATCH 32/44] Sbert Filter and Tiny Fix on ML Image Filter (the one for debugging) --- .../AbstractPostProcessorQueueWorker.php | 113 +++++++++++++----- .../filter/StrawberryRunnersMLImagefilter.php | 2 +- .../filter/StrawberryRunnersMLTextfilter.php | 6 +- 3 files changed, 89 insertions(+), 32 deletions(-) diff --git a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php index e77c7c2..5d59925 100644 --- a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php @@ -319,7 +319,7 @@ public function processItem($data) { $translations = $entity->getTranslationLanguages(); foreach ($translations as $translation_id => $translation) { // checksum and file->uuid apply even if the source is not a local-ized/ensure local file. - // But we will have to change this if we plan on indexing JSON RAW directly as an vector embedding. + // But we might want to review this if we plan on indexing JSON RAW/metadata directly as an vector embedding. $item_id = $entity->id() . ':' . $sequence_key . ':' . $translation_id . ':' . $file->uuid() . ':' . $data->plugin_config_entity_id; // a single 0 as return will force us to reindex.
$inindex = $inindex * $this->flavorInSolrIndex($item_id, $data->metadata['checksum'], $indexes); @@ -329,7 +329,7 @@ public function processItem($data) { // Check if we already have this entry in Solr if ($inindex !== 0 && !$data->force) { - $this->logger->log(LogLevel::INFO, 'Flavor already in index for @plugin on ADO Node ID @nodeid, not forced, so skipping.', + $this->logger->log(LogLevel::INFO, 'Flavor already in index for @plugin on ADO Node ID @nodeid, not forced, so skipping or chaining.', [ '@plugin' => $processor_instance->getPluginId(), '@nodeid' => $data->nid, @@ -337,6 +337,12 @@ public function processItem($data) { ); } $inkeystore = TRUE; + + // For now keeping a single language. Processor might not be aware of other languages for chaining indexed? + // Reason is even if we iterate over each language, $toindex == 1. Always the same. + // @TODO May 2024. Re-Review this in Flavor Data Source provider. We could save ourself a lot of KeyStore element.s + $processed_data_for_chaining = NULL; + // Skip file if element for every language is found in key_value collection. foreach ($item_ids as $item_id) { $processed_data = $this->keyValue->get($keyvalue_collection) @@ -346,7 +352,36 @@ public function processItem($data) { $processed_data->checksum != $data->metadata['checksum']) { $inkeystore = $inkeystore && FALSE; } + else { + // I am keeping a single one here. Should we discern by language for chaining? + // @TODO analize what it means for us. + $processed_data_for_chaining = $processed_data; + } + } + // May 2024. Allow a Processor that is to be indexed, already was processed and has data in the key store + // To use that data as input for a child one, if chained too. But only if nothing has set $io->output->plugin before + // This is needed for Processors (e.g OCR) that have already processed everything and then get a new chained + // Child that was never processed before. 
Would be terrible to have to re-process OCR completely just to get + // A Child to trigger. We will only provide only $io->input->plugin['searchapi'] bc that is what we know + // Any other type of child won't be able to feed from pre-existing. + if ($inkeystore && $tobechained && !$data->force && $processed_data_for_chaining!=NULL && (!isset($io->output->plugin) || !empty($io->output->plugin))) { + // Since we don't know at all what $io->output->plugin should contain + // We will pass the keystore value into $io->output->plugin and let the Processor itself (needs to have that logic) + // Deal with this use case. + $this->logger->log(LogLevel::INFO, 'Chaining @plugin on ADO Node ID @nodeid with preexisting data to the next one.', + [ + '@plugin' => $processor_instance->getPluginId(), + '@nodeid' => $data->nid, + ] + ); + if (!$io) { + $io= new \stdClass(); + $io->output = new \stdClass(); + $io->output->plugin = []; + } + $io->output->plugin['searchapi'] = $processed_data_for_chaining; } + // Allows a force in case of corrupted key value? Partial output // External/weird data? @@ -443,10 +478,19 @@ public function processItem($data) { continue ; } $childdata = clone $data; // So we do not touch original data + //@TODO. What if we want to force a child object only? + // We could IF the Child Object depends only on searchapi. + // Requires a Change in our SBR Trigger VBO plugin + // @TODO ask Allison. We might need a VBO processor to delete, selectively Flavors from Key/Solr too. + // Only way of A) removing Bias/bad vectors/Even bad OCR> And the processor should be also be able to mark + // ap:task no ML etc + /* if ($plugin_info['plugin_definition']['id'] ?? 
NULL == 'ml_sentence_transformer') { + $childdata->force = TRUE; + }*/ /* @var $strawberry_runners_postprocessor_config \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity */ $postprocessor_config_entity = $plugin_info['config_entity']; - $input_property = $plugin_info['plugin_definition']['input_property']; - $input_argument = $plugin_info['plugin_definition']['input_argument']; + $input_property = $plugin_info['plugin_definition']['input_property'] ?? NULL; + $input_argument = $plugin_info['plugin_definition']['input_argument'] ?? NULL; //@TODO check if this are here and not null! // $io->output will contain whatever the output is // We will check if the child processor @@ -457,7 +501,7 @@ public function processItem($data) { // - Can come from the original Data (most likely) // - May be overridden by the $io->output, e.g when a processor generates a file that is not part of any node $input_property_value_from_plugin = TRUE; - $input_property_value = isset($io->output->plugin) && isset($io->output->plugin[$input_property]) ? $io->output->plugin[$input_property] : NULL; + $input_property_value = $input_property && isset($io->output->plugin) && isset($io->output->plugin[$input_property]) ? $io->output->plugin[$input_property] : NULL; // If was not defined by the previous processor try from the main data. if ($input_property_value == NULL) { $input_property_value_from_plugin = FALSE; @@ -477,31 +521,44 @@ public function processItem($data) { // Warning Diego. This may lead to a null? $childdata->{$input_property} = $input_property_value; $childdata->plugin_config_entity_id = $postprocessor_config_entity->id(); - $input_argument_value = isset($io->output->plugin) && isset($io->output->plugin[$input_argument]) ? - $io->output->plugin[$input_argument] : $data->{$input_argument}; - // This is a must: Solr indexing requires a list of sequences. A single one - // will not be enqueued. 
- if (is_array($input_argument_value)) { - foreach ($input_argument_value as $value) { - // Here is the catch. - // Output properties may be many - // Input Properties matching always need to be one - if (!is_array($value)) { - $childdata->{$input_argument} = $value; - // The count will always be relative to this call - // Means count of how many children are being called. - $childdata->siblings = count($input_argument_value); - // In case the $input_property_value is an array coming from a plugin we may want to know if it has the same amount of values of $input_argument_value - // If so, it is many to one, and we only need the corresponding entry to this sequence - if ($input_property_value_from_plugin && - is_array($input_property_value) && - count($input_property_value) == $childdata->siblings && - isset($input_property_value[$value])) { - $childdata->{$input_property} = $input_property_value[$value]; + $input_argument_value = $input_argument && isset($io->output->plugin) && isset($io->output->plugin[$input_argument]) ? + $io->output->plugin[$input_argument] : ($input_argument && isset($data->{$input_argument}) ? $data->{$input_argument} : NULL); + + // May 2024, Most cases, like Pagers (PDF page extractors) $input_argument_value will be an array, a sequence + // Leading to many children. + // But for chained processors like ML ones, e.g each OCR will generate exactly ONE ML + // using the same input property of OCR. + // So we can no longer assume/not depend on $input_argument_value as we did until 0.7.0 + if ($input_argument_value) { + if (is_array($input_argument_value)) { + foreach ($input_argument_value as $value) { + // Here is the catch. + // Output properties may be many + // Input Properties matching always need to be one + if (!is_array($value)) { + $childdata->{$input_argument} = $value; + // The count will always be relative to this call + // Means count of how many children are being called. 
+ $childdata->siblings = count($input_argument_value); + // In case the $input_property_value is an array coming from a plugin we may want to know if it has the same amount of values of $input_argument_value + // If so, it is many to one, and we only need the corresponding entry to this sequence + if ($input_property_value_from_plugin && + is_array($input_property_value) && + count($input_property_value) == $childdata->siblings && + isset($input_property_value[$value])) { + $childdata->{$input_property} = $input_property_value[$value]; + } + Drupal::queue('strawberryrunners_process_background', TRUE) + ->createItem($childdata); } - Drupal::queue('strawberryrunners_process_background', TRUE) - ->createItem($childdata); } + } elseif (!empty($input_argument_value) && $input_property_value) { + // WE Have a single one. E.g Generated by a Double chaining. For 0.8.0 we will accept this option + $childdata->{$input_argument} = $input_argument_value; + $childdata->{$input_property} = $input_property_value; + $childdata->siblings = $childdata->siblings ?? 
1; + Drupal::queue('strawberryrunners_process_background', TRUE) + ->createItem($childdata); } } } diff --git a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php index 1ecc767..8913b26 100644 --- a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php +++ b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php @@ -208,7 +208,7 @@ public function defineOptions() { return $options; } - public function setSbrEntityStorage(EntityStorageInterface $sbrEntityStorage): StrawberryRunnersMLImagefilter + public function setSbrEntityStorage(EntityStorageInterface $sbrEntityStorage) { $this->sbrEntityStorage = $sbrEntityStorage; return $this; diff --git a/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php b/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php index 0439d93..82f5fcf 100644 --- a/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php +++ b/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php @@ -131,7 +131,7 @@ public function defineOptions() { return $options; } - public function setSbrEntityStorage(EntityStorageInterface $sbrEntityStorage): StrawberryRunnersMLImagefilter + public function setSbrEntityStorage(EntityStorageInterface $sbrEntityStorage) { $this->sbrEntityStorage = $sbrEntityStorage; return $this; @@ -283,7 +283,7 @@ public function validateOptionsForm(&$form, FormStateInterface $form_state) { $sbr_config = $plugin_config_entity->getPluginconfig(); // Note, we could also restrict to the same image mimetypes that the processor is setup to handle? if (isset($sbr_config['ml_method'])) { - $vector_size = abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$sbr_config['ml_method']] ?? ''; + $vector_size = abstractMLPostProcessor::ML_TEXT_VECTOR_SIZE[$sbr_config['ml_method']] ?? ''; $field_info = $this->getSbfDenseVectorFieldSource($field_id); if ($field_info) { // We do allow mixed data sources. One can be a node of course even if the source is a flavor. 
This is because each source could inherit properties from the other. @@ -414,7 +414,7 @@ public function query() { } $all_knns = $this->getQuery()->getOption('sbf_knn') ?? []; foreach ($response as $endpoint_key => $values) { - if (isset($values['vector']) && is_array($values['vector']) && count($values['vector']) == abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$sbr_config['ml_method']]) { + if (isset($values['vector']) && is_array($values['vector']) && count($values['vector']) == abstractMLPostProcessor::ML_TEXT_VECTOR_SIZE[$sbr_config['ml_method']]) { $all_knns[] = $this->buildKNNQuery($this->getQuery(), $values['vector']); } } From a43c3521269f345612f15b12f8e89e5c96500e75 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sun, 26 May 2024 17:12:19 -0400 Subject: [PATCH 33/44] Make Sure Processors that are Indexing ALso pass the whole ->saerchapi to the output so we can chain --- .../OcrPostProcessor.php | 18 ++++++++++-------- .../SubtitlePostProcessor.php | 3 ++- .../TextPostProcessor.php | 2 +- .../WebPageTextPostProcessor.php | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php index 41ac994..e393d94 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php @@ -300,12 +300,12 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $input_argument = $this->pluginDefinition['input_argument']; $file_uuid = isset($io->input->metadata['dr:uuid']) ? $io->input->metadata['dr:uuid'] : NULL; $node_uuid = isset($io->input->nuuid) ? $io->input->nuuid : NULL; - $config = $this->getConfiguration(); $timeout = $config['timeout']; // in seconds $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? 
'') : 'eng']; if (isset($io->input->{$input_property}) && $file_uuid && $node_uuid) { $output = new \stdClass(); + $output->plugin = NULL; // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} $sequence_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; setlocale(LC_CTYPE, 'en_US.UTF-8'); @@ -339,12 +339,14 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug ]); } $output->searchapi['fulltext'] = $miniocr; - $output->plugin = $miniocr; + // This is temporary. $io->output = $output; } } - //if not searchable run try to load the ADO, check if there is an as:text HOCR with the same size + //if not searchable try to load the ADO, check if there is an as:text HOCR with the same size //as the current Image and try to process, if not, run, tesseract + // @TODO. Ask Allison. If PDFAlto worked out, do we still need to check if there is an attached HOCR? + // Or does an attached HOCR always wins over PDFtoAlto? $width = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['width'] ?? NULL; $height = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['height'] ?? 
NULL; // In case identify failed, we can try with flv:exif (e.g JP2s might not pass the identify test) @@ -432,7 +434,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug ]); } $output->searchapi['fulltext'] = $miniocr; - $output->plugin = $miniocr; + $io->output = $output; $external_found = TRUE; } @@ -448,9 +450,9 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug } } } - // At this stage only run Tesseract if we are still without $output->plugin + // At this stage only run Tesseract if we are still without $output->searchapi['fulltext'] - if (!isset($output->plugin) || $output->plugin == NULL) { + if (!isset($output->searchapi['fulltext']) || $output->searchapi['fulltext'] == NULL) { setlocale(LC_CTYPE, 'en_US.UTF-8'); $execstring = $this->buildExecutableCommand($io); if ($execstring) { @@ -482,11 +484,10 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug ]); } $output->searchapi['fulltext'] = $miniocr; - $output->plugin = $miniocr; } } - if (!isset($output->plugin) || $output->plugin == NULL) { + if (!isset($output->searchapi['fulltext']) || $output->searchapi['fulltext'] == NULL) { // If we still have no OCR at this state it is time to bail out $this->logger->warning("@sbr_processor: HOCR to miniOCR processing from Tesseract failed for ADO with UUID @node_uuid and File with UUID @file_uuid with sequence number @sequence_id", [ @@ -604,6 +605,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $output->searchapi['processlang'] = $file_languages; $output->searchapi['ts'] = date("c"); $output->searchapi['label'] = $this->t("Sequence") . ' ' . 
$sequence_number; + $output->plugin['searchapi'] = $output->searchapi; $io->output = $output; } else { diff --git a/src/Plugin/StrawberryRunnersPostProcessor/SubtitlePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/SubtitlePostProcessor.php index 1dc3877..672a20a 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/SubtitlePostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/SubtitlePostProcessor.php @@ -244,6 +244,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? '') : 'eng']; if ($file_path && $file_uuid && $node_uuid) { $output = new \stdClass(); + $output->plugin = NULL; // Let's see if we need an output path or not $file_path = isset($io->input->{$input_property}) ? $io->input->{$input_property} : NULL; $out_file_path = NULL; @@ -266,7 +267,6 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $output->searchapi['fulltext'] = $miniocr ?? StrawberryfieldFlavorDatasource::EMPTY_MINIOCR_XML; - $output->plugin = $text_content; $output->searchapi['plaintext'] = $page_text; } else { @@ -413,6 +413,7 @@ function ($languages_enabled) { $output->searchapi['ts'] = date("c"); $output->searchapi['label'] = $this->t("Sequence") . ' ' . 
$sequence_number; + $output->plugin['searchapi'] = $output->searchapi; $io->output = $output; } else { diff --git a/src/Plugin/StrawberryRunnersPostProcessor/TextPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/TextPostProcessor.php index 03ccf04..47af9c8 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/TextPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/TextPostProcessor.php @@ -286,7 +286,6 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug } $output->searchapi['fulltext'] = StrawberryfieldFlavorDatasource::EMPTY_MINIOCR_XML; - $output->plugin = $text_content; $output->searchapi['plaintext'] = $page_text; } else { @@ -433,6 +432,7 @@ function ($languages_enabled) { $output->searchapi['ts'] = date("c"); $output->searchapi['label'] = $this->t("Sequence") . ' ' . $sequence_number; + $output->plugin['searchapi'] = $output->searchapi; $io->output = $output; } else { diff --git a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php index cbd08f8..513b7ed 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php @@ -291,7 +291,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $output->searchapi['processlang'] = []; $output->searchapi['label'] = $page_title; $output->searchapi['ts'] = $page_ts; - $output->plugin = $output->searchapi; + $output->plugin['searchapi'] = $output->searchapi; } else { throw new \Exception("WebPage Text was not a valid JSON."); From 7e7f25d0345a73a97bf619cf53bfc74bc05f62b3 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sun, 26 May 2024 17:12:40 -0400 Subject: [PATCH 34/44] Cleans ML processors --- .../MLInsightfacePostProcessor.php | 4 +- .../MLMobileNetPostProcessor.php | 6 +- .../MLSentenceTransformertPostProcessor.php | 147 
++++++++++++++++++ .../MLYoloPostProcessor.php | 6 +- 4 files changed, 155 insertions(+), 8 deletions(-) create mode 100644 src/Plugin/StrawberryRunnersPostProcessor/MLSentenceTransformertPostProcessor.php diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php index 306767c..85cda21 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php @@ -58,7 +58,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { return $element; } - protected function runTextMLfromMetadata($io, NlpClient $nlpClient): \stdClass { + protected function runTextMLfromJSON($io, NlpClient $nlpClient): \stdClass { $output = new \stdClass(); return $output; } @@ -97,7 +97,6 @@ protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { if (isset($ML['insightface']['objects']) && is_array($ML['insightface']['objects']) && count($ML['insightface']['objects']) > 0 ) { $miniocr = $this->insightfacenetToMiniOCR($ML['insightface']['objects'], $width, $height, $sequence_number); $output->searchapi['fulltext'] = $miniocr; - $output->plugin = $miniocr; $page_text = isset($output->searchapi['fulltext']) ? strip_tags(str_replace("", PHP_EOL . " ", $output->searchapi['fulltext'])) : ''; // What is a good confidence ratio here? @@ -111,6 +110,7 @@ protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { $output->searchapi['processlang'] = $file_languages; $output->searchapi['ts'] = date("c"); $output->searchapi['label'] = $this->t("Insightface ML Image Embeddings & Vectors") . ' ' . 
$sequence_number; + $output->plugin['searchapi'] = $output->searchapi; return $output; } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php index d8f3f0e..661a3a0 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php @@ -58,10 +58,10 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { return $element; } - protected function runTextMLfromMetadata($io, NlpClient $nlpClient): \stdClass { + protected function runTextMLfromJSON($io, NlpClient $nlpClient): \stdClass { $output = new \stdClass(); return $output; - // TODO: Implement runTextMLfromMetadata() method. + // TODO: Implement runTextMLfromJSON() method. } protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { @@ -98,7 +98,6 @@ protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { if (isset($ML['mobilenet']['objects']) && is_array($ML['mobilenet']['objects']) && count($ML['mobilenet']['objects']) > 0 ) { $miniocr = $this->mobilenetToMiniOCR($ML['mobilenet']['objects'], $width, $height, $sequence_number); $output->searchapi['fulltext'] = $miniocr; - $output->plugin = $miniocr; $page_text = isset($output->searchapi['fulltext']) ? strip_tags(str_replace("", PHP_EOL . " ", $output->searchapi['fulltext'])) : ''; // What is a good confidence ratio here? @@ -116,6 +115,7 @@ protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { $output->searchapi['processlang'] = $file_languages; $output->searchapi['ts'] = date("c"); $output->searchapi['label'] = $this->t("MobileNet ML Image Embeddings & Vectors") . ' ' . 
$sequence_number; + $output->plugin['searchapi'] = $output->searchapi; return $output; } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLSentenceTransformertPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLSentenceTransformertPostProcessor.php new file mode 100644 index 0000000..968b9fe --- /dev/null +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLSentenceTransformertPostProcessor.php @@ -0,0 +1,147 @@ + 'asstructure', + 'mime_type' => ['image/jpeg'], + 'output_type' => 'json', + 'output_destination' => 'searchapi', + 'processor_queue_type' => 'background', + 'language_key' => 'language_iso639_3', + 'language_default' => 'eng', + 'timeout' => 300, + 'nlp_url' => 'http://esmero-nlp:6400', + 'ml_method' => '/text/sentence_transformer', + ] + parent::defaultConfiguration(); + } + + public function settingsForm(array $parents, FormStateInterface $form_state) { + $element = parent::settingsForm($parents, $form_state); + $element['source_type'] = [ + '#type' => 'select', + '#title' => $this->t('The type of source data this processor works on'), + '#options' => [ + 'json' => 'JSON passed by a parent Processor', + ], + '#default_value' => $this->getConfiguration()['source_type'], + '#description' => $this->t('Select from where the source file this processor needs is fetched'), + '#required' => TRUE, + ]; + $element['output_type'] = [ + '#type' => 'select', + '#title' => $this->t('The expected and desired output of this processor.'), + '#options' => [ + 'json' => 'Data/Values that can be serialized to JSON', + ], + '#default_value' => $this->getConfiguration()['output_type'], + '#description' => $this->t('If the output is just data and "One or more Files" is selected all data will be dumped into a file and handled as such.'), + ]; + $element['output_destination'] = [ + '#type' => 'checkboxes', + '#title' => $this->t("Where and how the output will be used."), + '#options' => [ + 'plugin' => 'As Input for another processor Plugin', + 'searchapi' => 
'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)',
+      ],
+      '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? $this->getConfiguration()['output_destination'] : [],
+      '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output. This plugin always generates also search API output data.'),
+      '#required' => TRUE,
+    ];
+    return $element;
+  }
+
+  /**
+   * Builds Search API output with a text embedding for data passed by a parent.
+   *
+   * Reads the plain text from $io->input->{input_property}->plaintext, sends it
+   * to the text ML endpoint through callTextML() and stores the returned vector
+   * (only when it has exactly 384 entries) plus the pass-through metadata and
+   * fulltext in $output->searchapi. The same data is mirrored into
+   * $output->plugin['searchapi'].
+   *
+   * @param \stdClass $io
+   *   The processor input/output holder; only $io->input is read here.
+   * @param NlpClient $nlpClient
+   *   Not used by this implementation; callTextML() fetches its own client.
+   *
+   * @return \stdClass
+   *   The populated output, or an empty object when the input property is
+   *   missing or carries no plain text.
+   */
+  protected function runTextMLfromJSON($io, NlpClient $nlpClient): \stdClass
+  {
+    $output = new \stdClass();
+    $config = $this->getConfiguration();
+
+    $input_argument = $this->pluginDefinition['input_argument'];
+    $input_property = $this->pluginDefinition['input_property'];
+
+    $file_languages = isset($io->input->lang) ? (array)$io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? '') : 'eng'];
+    $sequence_number = isset($io->input->{$input_argument}) ? (int)$io->input->{$input_argument} : 1;
+
+    setlocale(LC_CTYPE, 'en_US.UTF-8');
+    if (isset($io->input->{$input_property})) {
+      $page_text = $io->input->{$input_property}->plaintext ?? NULL;
+      if ($page_text) {
+        $output->plugin = NULL;
+        // FALSE: this is document text to embed, not a search query.
+        $ML = $this->callTextML($page_text, false);
+        $output->searchapi['vector_384'] = isset($ML['sentence_transformer']['vector']) && is_array($ML['sentence_transformer']['vector']) && count($ML['sentence_transformer']['vector']) == 384 ? $ML['sentence_transformer']['vector'] : NULL;
+        $output->searchapi['metadata'] = $io->input->{$input_property}->metadata ?? [];
+        // Fingerprint the model info from the same response key the vector was
+        // read from. NOTE(review): assumes the service reports 'modelinfo'
+        // under 'sentence_transformer'; confirm against the NLP service.
+        $output->searchapi['service_md5'] = isset($ML['sentence_transformer']['modelinfo']) ? md5(json_encode($ML['sentence_transformer']['modelinfo'])) : NULL;
+        $output->searchapi['plaintext'] = $page_text;
+        $output->searchapi['fulltext'] = $io->input->{$input_property}->fulltext ?? [];
+        $output->searchapi['processlang'] = $file_languages;
+        $output->searchapi['ts'] = date("c");
+        $output->searchapi['label'] = $this->t("Sentence Transformer ML Text Embeddings & Vectors") . ' ' . $sequence_number;
+        $output->plugin['searchapi'] = $output->searchapi;
+      }
+    }
+    return $output;
+  }
+
+  /**
+   * Image ML is not provided by this text processor.
+   *
+   * @return mixed
+   *   Always FALSE.
+   */
+  public function callImageML($image_url, $labels):mixed {
+    return FALSE;
+  }
+
+  /**
+   * Calls the configured text ML endpoint for a piece of text.
+   *
+   * @param string $text
+   *   The text sent as the 'text' argument.
+   * @param bool $query
+   *   When truthy, a 'query' => TRUE argument is sent along with the text.
+   *
+   * @return mixed
+   *   Whatever the NLP client returns for the call.
+   */
+  public function callTextML($text, $query = TRUE):mixed {
+    $nlpClient = $this->getNLPClient();
+    $config = $this->getConfiguration();
+    $arguments = ['text' => $text];
+    if ($query) {
+      $arguments['query'] = TRUE;
+    }
+    //@TODO we are not filtering here by label yet. Next release.
+    return $nlpClient->get_call($config['ml_method'], $arguments, 1);
+  }
+
+  /**
+   * Image input is not handled by this text processor.
+   *
+   * @return \stdClass
+   *   Always an empty object.
+   */
+  protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass
+  {
+    $output = new \stdClass();
+    return $output;
+  }
+
+}
diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php index 45f7c94..2346345 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php @@ -58,10 +58,10 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { return $element; } - protected function runTextMLfromMetadata($io, NlpClient $nlpClient): \stdClass { + protected function runTextMLfromJSON($io, NlpClient $nlpClient): \stdClass { $output = new \stdClass(); return $output; - // TODO: Implement runTextMLfromMetadata() method. + // TODO: Implement runTextMLfromJSON() method.
} protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { @@ -98,7 +98,6 @@ protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { if (isset($ML['yolo']['objects']) && is_array($ML['yolo']['objects']) && count($ML['yolo']['objects']) > 0 ) { $miniocr = $this->yoloToMiniOCR($ML['yolo']['objects'], $width, $height, $sequence_number); $output->searchapi['fulltext'] = $miniocr; - $output->plugin = $miniocr; $page_text = isset($output->searchapi['fulltext']) ? strip_tags(str_replace("", PHP_EOL . " ", $output->searchapi['fulltext'])) : ''; // What is a good confidence ratio here? @@ -114,6 +113,7 @@ protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { $output->searchapi['processlang'] = $file_languages; $output->searchapi['ts'] = date("c"); $output->searchapi['label'] = $this->t("ML Image Embeddings & Vectors") . ' ' . $sequence_number; + $output->plugin['searchapi'] = $output->searchapi; return $output; } From 5ae461c7bf3185b6d7b57e5734067909fe9b71c3 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sun, 26 May 2024 17:12:59 -0400 Subject: [PATCH 35/44] Simplify a bit the AbstractML Processor all the other ones inherit --- .../abstractMLPostProcessor.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php index 6fc4774..1ba365a 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php @@ -185,7 +185,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { '#options' => [ '/image/yolo' => 'yolov8 (Image Object detection (as MiniOCR Annotations) & embedding as a Unit Length Vector)', '/image/mobilenet' => 'MobileNet (Image embeddings as a a Unit Length Vector)', - '/text/bert' => 'Bert (text embeddings as a Unit 
Length Vector)', + '/text/sentence_transformer' => 'SBert Sentence Transformer (text embeddings as a Unit Length Vector)', '/image/insightface' => 'InsightFace (Detection as MiniOCR Annotations and embedding as a Unit Length Vector)', ], '#default_value' => $this->getConfiguration()['ml_method'], @@ -265,7 +265,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $io->output = $mloutput ?? $output; } elseif (in_array($config['source_type'], ['ado', 'json']) && $node_uuid) { - $mloutput = $this->runTextMLfromMetadata($io, $nlp); + $mloutput = $this->runTextMLfromJSON($io, $nlp); $io->output = $mloutput ?? $output; } else { @@ -287,7 +287,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug abstract protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass; - abstract protected function runTextMLfromMetadata($io, NlpClient $nlpClient) :\stdClass; + abstract protected function runTextMLfromJSON($io, NlpClient $nlpClient) :\stdClass; // Mime types supported as input to Tesseract. 
// See https://github.com/tesseract-ocr/tessdoc/blob/main/InputFormats.md From 5bbf75a567cbd104dbf361a3cc420154f701ab62 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sun, 26 May 2024 17:13:19 -0400 Subject: [PATCH 36/44] Adds the Text filter as a Filter/Plugin for Search API --- strawberry_runners.module | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/strawberry_runners.module b/strawberry_runners.module index cf7df2f..f4e2ef5 100644 --- a/strawberry_runners.module +++ b/strawberry_runners.module @@ -21,6 +21,7 @@ function strawberry_runners_views_data_alter(array &$data) { $table = &$data[$key]; $ml_image_filter = _search_api_views_find_field_alias('sbr_imageml_filter', $table); + $ml_text_filter = _search_api_views_find_field_alias('sbr_textml_filter', $table); $table[$ml_image_filter] = [ 'title' => t('Image Similarity Filter via KNN (Experimental)'), 'group' => t('Search'), @@ -40,6 +41,19 @@ function strawberry_runners_views_data_alter(array &$data) { if ($ml_image_filter != 'sbr_imageml_filter') { $table[$ml_image_filter]['real field'] = 'sbr_imageml_filter'; } + $table[$ml_text_filter] = [ + 'title' => t('Text Similarity Filter via KNN (Experimental)'), + 'group' => t('Search'), + 'help' => t('Filters one Query Phrases to the Corresponding Vector in a Strawberry Flavor Document generating on the Fly an Embedding Vector.'), + 'filter' => [ + 'title' => t('Text Similarity Filter via KNN '), + 'field' => 'id', + 'id' => 'sbr_textml_filter', + ], + ]; + if ($ml_text_filter != 'sbr_textml_filter') { + $table[$ml_text_filter]['real field'] = 'sbr_textml_filter'; + } } catch (\Exception $e) { $args = [ From 4eed5d2db00db504a2651e53d39c2f1adac09920 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Wed, 29 May 2024 09:20:42 -0400 Subject: [PATCH 37/44] If no face, then maybe just don't generate search API at all? 
Todo ask @alliomeria and the community about "empty completeness" v/s "absence of data" (first implies normalization, e.g doc count. == processed count), the second slimmer archipelago --- .../AbstractPostProcessorQueueWorker.php | 2 +- .../MLInsightfacePostProcessor.php | 21 ++++++++++--------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php index 5d59925..2cae717 100644 --- a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php @@ -374,7 +374,7 @@ public function processItem($data) { '@nodeid' => $data->nid, ] ); - if (!$io) { + if (!isset($io)) { $io= new \stdClass(); $io->output = new \stdClass(); $io->output->plugin = []; diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php index 85cda21..ef7e34e 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php @@ -95,6 +95,7 @@ protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { $ML = $this->callImageML($iiif_image_url,$labels); $output->searchapi['vector_512'] = isset($ML['insightface']['vector']) && is_array($ML['insightface']['vector']) && count($ML['insightface']['vector'])== 512 ? $ML['insightface']['vector'] : NULL; if (isset($ML['insightface']['objects']) && is_array($ML['insightface']['objects']) && count($ML['insightface']['objects']) > 0 ) { + // Don't do anything if no detection. $miniocr = $this->insightfacenetToMiniOCR($ML['insightface']['objects'], $width, $height, $sequence_number); $output->searchapi['fulltext'] = $miniocr; $page_text = isset($output->searchapi['fulltext']) ? 
strip_tags(str_replace("", @@ -103,14 +104,14 @@ protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { // based on the % of the bounding box? // Just the value? $labels['Face'] = 'Face'; + $output->searchapi['metadata'] = $labels; + $output->searchapi['service_md5'] = isset($ML['insightface']['modelinfo']) ? md5(json_encode($ML['insightface']['modelinfo'])) : NULL; + $output->searchapi['plaintext'] = $page_text ?? ''; + $output->searchapi['processlang'] = $file_languages; + $output->searchapi['ts'] = date("c"); + $output->searchapi['label'] = $this->t("Insightface ML Image Embeddings & Vectors") . ' ' . $sequence_number; + $output->plugin['searchapi'] = $output->searchapi; } - $output->searchapi['metadata'] = $labels; - $output->searchapi['service_md5'] = isset($ML['insightface']['modelinfo']) ? md5(json_encode($ML['insightface']['modelinfo'])) : NULL; - $output->searchapi['plaintext'] = $page_text ?? ''; - $output->searchapi['processlang'] = $file_languages; - $output->searchapi['ts'] = date("c"); - $output->searchapi['label'] = $this->t("Insightface ML Image Embeddings & Vectors") . ' ' . $sequence_number; - $output->plugin['searchapi'] = $output->searchapi; return $output; } @@ -140,13 +141,13 @@ protected function insightfacenetToMiniOCR(array $objects, $width, $height, $pag $miniocr->startElement("l"); $x0 = (float)$object['bbox'][0]; $y0 = (float)$object['bbox'][1]; - $w = (float)$object['bbox'][2]- $x0; - $h = (float)$object['bbox'][3] -$y0; + $w = (float)$object['bbox'][2] - $x0; + $h = (float)$object['bbox'][3] - $y0; $l = ltrim(sprintf('%.3f', $x0) ?? '', 0); $t = ltrim(sprintf('%.3f', $y0) ?? '', 0); $w = ltrim(sprintf('%.3f', $w) ?? '', 0); $h = ltrim(sprintf('%.3f', $h) ?? '', 0); - $text .= (string)('Face') . ' ~ ' . (string)sprintf('%.3f', $object['score'] ?? 0); + $text = (string)('Face') . ' ~ ' . (string)sprintf('%.3f', $object['score'] ?? 
0); if ($notFirstWord) { $miniocr->text(' '); From c3eba87ba69fae8a72f4e427a15da926843dbe79 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Thu, 20 Jun 2024 13:07:11 -0400 Subject: [PATCH 38/44] mark pre query as disabled/future features. We want people to know this will be possible, but for now it is easier to treat all filters are pre-queries (as we do) --- .../views/argument/StrawberryRunnersMLImageArgument.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php b/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php index e6eaa1a..3abebbf 100644 --- a/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php +++ b/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php @@ -221,18 +221,20 @@ public function buildOptionsForm(&$form, FormStateInterface $form_state) { $form['pre_query'] = [ '#type' => 'checkbox', '#default_value' => $this->options['pre_query'], - '#title' => $this->t('Treat previous filters to this as prequeries'), + '#title' => $this->t('Treat previous filters to this as pre queries (Future Feature)'), '#description'=> $this->t( 'If any other filter setup before this one will be treated as pre-queries to the actual KNN query.' ), + '#disabled' => TRUE, ]; $form['pre_query_facets'] = [ '#type' => 'checkbox', '#default_value' => $this->options['pre_query_facets'], - '#title' => $this->t('Treat also facets, if any, as prequeries'), + '#title' => $this->t('Treat also facets, if any, as pre queries (Future Feature)'), '#description'=> $this->t( 'If any other facets will be treated as pre-queries to the actual KNN query.' ), + '#disabled' => TRUE, ]; $form['topk'] = [ '#type' => 'number', From 51a6c3db3e3cf88a733b3be28790befc78ed3263 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Thu, 20 Jun 2024 13:23:34 -0400 Subject: [PATCH 39/44] Limit this. ADO processing does not yet exist. But will. 
Also don't state it is YOLOv8, in the future (near) it might be yolov10, yolov11, etc. --- .../MLInsightfacePostProcessor.php | 22 ++++++++++++++++++ .../MLMobileNetPostProcessor.php | 23 +++++++++++++++++++ .../MLSentenceTransformertPostProcessor.php | 13 ++++++++++- .../MLYoloPostProcessor.php | 22 ++++++++++++++++++ .../abstractMLPostProcessor.php | 3 +-- 5 files changed, 80 insertions(+), 3 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php index ef7e34e..9ccee1f 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php @@ -55,6 +55,28 @@ public function defaultConfiguration() { public function settingsForm(array $parents, FormStateInterface $form_state) { $element = parent::settingsForm($parents, $form_state); + $element['source_type'] = [ + '#type' => 'select', + '#title' => $this->t('The type of source data this processor works on'), + '#options' => [ + 'asstructure' => 'File entities referenced in the as:filetype JSON structure', + ], + '#default_value' => $this->getConfiguration()['source_type'], + '#description' => $this->t('Select from where the source data this processor needs is fetched'), + '#required' => TRUE, + ]; + $element['ml_method'] = [ + '#type' => 'radios', + '#title' => $this->t('ML endpoint to use (fixed)'), + '#options' => [ + '/image/insightface' => 'InsightFace (Detections as MiniOCR Annotations and one embedding as a Unit Length Vector)', + ], + '#default_value' => $this->getConfiguration()['ml_method'], + '#description' => $this->t('The ML endpoint/Model. This is fixed for this processor.'), + '#required' => TRUE, + ]; + // Only Images for now. 
+ $element['jsonkey']['#options'] = [ 'as:image' => 'as:image']; return $element; } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php index 661a3a0..db23bd4 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php @@ -55,6 +55,29 @@ public function defaultConfiguration() { public function settingsForm(array $parents, FormStateInterface $form_state) { $element = parent::settingsForm($parents, $form_state); + $element['source_type'] = [ + '#type' => 'select', + '#title' => $this->t('The type of source data this processor works on'), + '#options' => [ + 'asstructure' => 'File entities referenced in the as:filetype JSON structure', + ], + '#default_value' => $this->getConfiguration()['source_type'], + '#description' => $this->t('Select from where the source data this processor needs is fetched'), + '#required' => TRUE, + ]; + $element['ml_method'] = [ + '#type' => 'radios', + '#title' => $this->t('ML endpoint to use (fixed)'), + '#options' => [ + '/image/mobilenet' => 'MobileNet (Image embeddings as a a Unit Length Vector)', + ], + '#default_value' => $this->getConfiguration()['ml_method'], + '#description' => $this->t('The ML endpoint/Model. This is fixed for this processor.'), + '#required' => TRUE, + ]; + // Only Images for now. 
+ $element['jsonkey']['#options'] = [ 'as:image' => 'as:image']; + return $element; } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLSentenceTransformertPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLSentenceTransformertPostProcessor.php index 968b9fe..d6775a3 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLSentenceTransformertPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLSentenceTransformertPostProcessor.php @@ -60,7 +60,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { '#type' => 'select', '#title' => $this->t('The type of source data this processor works on'), '#options' => [ - 'json' => 'JSON passed by a parent Processor', + 'json' => 'JSON passed by a parent Processor.This processor needs to be chained to another one that generates Text. e.g OCR.', ], '#default_value' => $this->getConfiguration()['source_type'], '#description' => $this->t('Select from where the source file this processor needs is fetched'), @@ -86,6 +86,17 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output. This plugin always generates also search API output data.'), '#required' => TRUE, ]; + $element['ml_method'] = [ + '#type' => 'radios', + '#title' => $this->t('ML endpoint to use (fixed)'), + '#options' => [ + '/text/sentence_transformer' => 'SBert Sentence Transformer (text embeddings as a Unit Length Vector)', + ], + '#default_value' => $this->getConfiguration()['ml_method'], + '#description' => $this->t('The ML endpoint/Model. 
This is fixed for this processor.'), + '#required' => TRUE, + ]; + return $element; } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php index 2346345..cf89ee9 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php @@ -55,6 +55,28 @@ public function defaultConfiguration() { public function settingsForm(array $parents, FormStateInterface $form_state) { $element = parent::settingsForm($parents, $form_state); + $element['source_type'] = [ + '#type' => 'select', + '#title' => $this->t('The type of source data this processor works on'), + '#options' => [ + 'asstructure' => 'File entities referenced in the as:filetype JSON structure', + ], + '#default_value' => $this->getConfiguration()['source_type'], + '#description' => $this->t('Select from where the source data this processor needs is fetched'), + '#required' => TRUE, + ]; + $element['ml_method'] = [ + '#type' => 'radios', + '#title' => $this->t('ML endpoint to use (fixed)'), + '#options' => [ + '/image/yolo' => 'YOLO (Image Object detection (as MiniOCR Annotations) & embedding as a Unit Length Vector)', + ], + '#default_value' => $this->getConfiguration()['ml_method'], + '#description' => $this->t('The ML endpoint/Model. This is fixed for this processor.'), + '#required' => TRUE, + ]; + // Only Images for now. 
+ $element['jsonkey']['#options'] = [ 'as:image' => 'as:image']; return $element; } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php index 1ba365a..733e6dd 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php @@ -105,7 +105,6 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { '#states' => [ 'visible' => [ ':input[name="pluginconfig[source_type]"]' => ['value' => 'ado'], - ':input[name="pluginconfig[source_type]"]' => ['value' => 'json'], ], ], ]; @@ -183,7 +182,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { '#type' => 'radios', '#title' => $this->t('Which ML endpoint to use'), '#options' => [ - '/image/yolo' => 'yolov8 (Image Object detection (as MiniOCR Annotations) & embedding as a Unit Length Vector)', + '/image/yolo' => 'YOLO (Image Object detection (as MiniOCR Annotations) & embedding as a Unit Length Vector)', '/image/mobilenet' => 'MobileNet (Image embeddings as a a Unit Length Vector)', '/text/sentence_transformer' => 'SBert Sentence Transformer (text embeddings as a Unit Length Vector)', '/image/insightface' => 'InsightFace (Detection as MiniOCR Annotations and embedding as a Unit Length Vector)', From c491d646a43658691405eae5301d7600ec120eb3 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Thu, 20 Jun 2024 13:26:13 -0400 Subject: [PATCH 40/44] Mark facet pre queries as future feature --- src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php b/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php index 82f5fcf..d747a27 100644 --- a/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php +++ 
b/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php @@ -226,18 +226,20 @@ public function buildOptionsForm(&$form, FormStateInterface $form_state) { $form['pre_query'] = [ '#type' => 'checkbox', '#default_value' => $this->options['pre_query'], - '#title' => $this->t('Treat previous filters to this as prequeries'), + '#title' => $this->t('Treat previous filters to this as pre queries (Future Feature)'), '#description'=> $this->t( 'If any other filter setup before this one will be treated as pre-queries to the actual KNN query.' ), + '#disabled' => TRUE, ]; $form['pre_query_facets'] = [ '#type' => 'checkbox', '#default_value' => $this->options['pre_query_facets'], - '#title' => $this->t('Treat also facets, if any, as prequeries'), + '#title' => $this->t('Treat also facets, if any, as pre queries (Future Feature)'), '#description'=> $this->t( 'If any other facets will be treated as pre-queries to the actual KNN query.' ), + '#disabled' => TRUE, ]; $form['topk'] = [ '#type' => 'number', From a387f0eaa75100a4b16531fd9857ca09483312f7 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Thu, 20 Jun 2024 13:27:44 -0400 Subject: [PATCH 41/44] mark Test/Internal Filter for ML pre queries as future features Also: Need to document this \Drupal\strawberry_runners\Plugin\views\filter\StrawberryRunnersMLImagefilter::IMAGEML_INPUT_SCHEMA so people testing know what the expected SHAPE of the JSON is. Again. This filter is just a way of exposing values. 
Programmatically (user/dev needs to know what needs to be done) one would submit a JSON with that structure @alliomeria --- src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php index 8913b26..860d89e 100644 --- a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php +++ b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php @@ -301,18 +301,20 @@ public function buildOptionsForm(&$form, FormStateInterface $form_state) { $form['pre_query'] = [ '#type' => 'checkbox', '#default_value' => $this->options['pre_query'], - '#title' => $this->t('Treat previous filters to this as prequeries'), + '#title' => $this->t('Treat previous filters to this as pre queries (Future Feature)'), '#description'=> $this->t( 'If any other filter setup before this one will be treated as pre-queries to the actual KNN query.' ), + '#disabled' => TRUE, ]; $form['pre_query_facets'] = [ '#type' => 'checkbox', '#default_value' => $this->options['pre_query_facets'], - '#title' => $this->t('Treat also facets, if any, as prequeries'), + '#title' => $this->t('Treat also facets, if any, as pre queries (Future Feature)'), '#description'=> $this->t( 'If any other facets will be treated as pre-queries to the actual KNN query.' 
), + '#disabled' => TRUE, ]; $form['topk'] = [ '#type' => 'number', From 820f3eb5c0d57a83da71691e5e8f7495ed474d86 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Thu, 20 Jun 2024 14:20:19 -0400 Subject: [PATCH 42/44] Adds Permissions and note about Anonymous users --- strawberry_runners.permissions.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 strawberry_runners.permissions.yml diff --git a/strawberry_runners.permissions.yml b/strawberry_runners.permissions.yml new file mode 100644 index 0000000..d53663d --- /dev/null +++ b/strawberry_runners.permissions.yml @@ -0,0 +1,6 @@ +'execute Image ML queries': + title: 'Execute Image ML queries (KNN). This permission is enforced by the ML Image Views Argument Plugin and Filter Plugin' + description: 'Only users with this permission will have Image based ML Views queries executed when using ML Image Views Argument(s). For security/performance reasons, this permission has no effect on Anonymous Users.' +'execute Text ML queries': + title: 'Execute Text ML queries (KNN). This permission is enforced by the ML Text Views Filter Plugin' + description: 'Only users with this permission will have Text based ML Views queries executed when using the ML Text Views Filter. For security/performance reasons, this permission has no effect on Anonymous Users.' From adc699bbe49c73b720406f3c1bdceedc0d7595c1 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Thu, 20 Jun 2024 14:21:35 -0400 Subject: [PATCH 43/44] Apply permissions on Exposed form and Query itself. 
Check for Anonymous --- .../filter/StrawberryRunnersMLImagefilter.php | 12 ++++++- .../filter/StrawberryRunnersMLTextfilter.php | 34 +++++++++++-------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php index 860d89e..8296232 100644 --- a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php +++ b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php @@ -384,6 +384,11 @@ public function submitOptionsForm(&$form, FormStateInterface $form_state) { ); } + public function isExposed() + { + return parent::isExposed() && ((!$this->currentUser->isAnonymous() && $this->currentUser->hasPermission('execute Image ML queries')) || $this->currentUser->hasRole('administrator')); + } + protected function valueForm(&$form, FormStateInterface $form_state) { // At this stage $this->value is not set? $this->value = is_array($this->value) ? $this->value : (array) $this->value; @@ -400,7 +405,8 @@ protected function valueForm(&$form, FormStateInterface $form_state) { '#type' => 'textarea', '#title' => t('JSON used to query public form'), '#prefix' => '
', - '#suffix' => '
' + '#suffix' => '', + '#access' => !$this->currentUser->isAnonymous() && $this->currentUser->hasPermission('execute Image ML queries') || $this->currentUser->hasRole('administrator'), ] ; } } @@ -427,6 +433,10 @@ public function buildExposeForm(&$form, FormStateInterface $form_state) { public function query() { + if ($this->currentUser->isAnonymous() || (!$this->currentUser->hasPermission('execute Image ML queries') && !$this->currentUser->hasRole('administrator'))) { + return; + } + if (empty($this->value) || empty($this->validated_exposed_input) || !$this->getQuery()) { // basically not validated, not present as a value and also someone cancelled/nuklled the query before? return; diff --git a/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php b/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php index d747a27..b5a74bd 100644 --- a/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php +++ b/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php @@ -309,23 +309,33 @@ public function submitOptionsForm(&$form, FormStateInterface $form_state) { ); } + /** + * @inheritDoc + */ + public function isExposed() + { + return parent::isExposed() && ((!$this->currentUser->isAnonymous() && $this->currentUser->hasPermission('execute Text ML queries')) || $this->currentUser->hasRole('administrator')); + } + + protected function valueForm(&$form, FormStateInterface $form_state) { // At this stage $this->value is not set? $this->value = is_array($this->value) ? $this->value : (array) $this->value; if (!$form_state->get('exposed')) { $form['value'] = [ '#type' => 'textarea', - '#title' => t('JSON used to query internal form'), + '#title' => t('Text query to be Vectorized'), '#prefix' => '
', '#suffix' => '
' ]; } - elseif ($this->isExposed()) { + elseif ($this->isExposed() ) { $form['value'] = [ '#type' => 'textarea', - '#title' => t('JSON used to query public form'), + '#title' => t('Text query to be vectorized'), '#prefix' => '
', - '#suffix' => '
' + '#suffix' => '', + '#access' => !$this->currentUser->isAnonymous() && $this->currentUser->hasPermission('execute Text ML queries') || $this->currentUser->hasRole('administrator'), ] ; } } @@ -352,19 +362,13 @@ public function buildExposeForm(&$form, FormStateInterface $form_state) { public function query() { - if (empty($this->value) || empty($this->validated_exposed_input) || !$this->getQuery()) { - // basically not validated, not present as a value and also someone cancelled/nuklled the query before? + if (empty($this->value) || empty($this->validated_exposed_input) || !$this->getQuery() || + ($this->currentUser->isAnonymous() || (!$this->currentUser->hasPermission('execute Text ML queries') && !$this->currentUser->hasRole('administrator'))) + ) { + // basically not validated, not present as a value or not the right permissions. return; } - /* - * $this->value = {stdClass} - iiif_image_id = "s3://3b9%2Fimage-dcpl-p034-npsncr-00015-rexported-f2c69aeb-7bcb-434a-a781-e580cb3695b7.tiff" - bbox = {stdClass} - x = {float} 0.0 - y = {float} 0.0 - w = {float} 1.0 - h = {float} 1.0 - */ + // Just to be sure here bc we have our own way. 
Who knows if some external code decides to alter the value $this->value = $this->validated_exposed_input; // We should only be at this stage if we have validation From 9e26bda8ad3043aa74ee675d511d8bc766e169e3 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Thu, 20 Jun 2024 14:21:58 -0400 Subject: [PATCH 44/44] Apply permissions on Exposed Image Argument/Views for ML --- .../views/argument/StrawberryRunnersMLImageArgument.php | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php b/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php index 3abebbf..32727e9 100644 --- a/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php +++ b/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php @@ -316,6 +316,10 @@ public function setArgument($arg) { public function query($group_by = FALSE) { + // If the user does not have this permission, simply return as if nothing was sent. + if ($this->currentUser->isAnonymous() || (!$this->currentUser->hasPermission('execute Image ML queries') && !$this->currentUser->hasRole('administrator'))) { + return; + } $this->argument_validated; if (empty($this->expanded_argument) || ! $this->query) { // basically not validated, not present as a value and also someone cancelled/nuklled the query before?