diff --git a/src/Annotation/StrawberryRunnersPostProcessor.php b/src/Annotation/StrawberryRunnersPostProcessor.php index 1843810..1c05afb 100644 --- a/src/Annotation/StrawberryRunnersPostProcessor.php +++ b/src/Annotation/StrawberryRunnersPostProcessor.php @@ -63,7 +63,7 @@ class StrawberryRunnersPostProcessor extends Plugin { public $input_argument; /** - * Processing stage: can be Entity PreSave or PostSave + * Processing stage: can be Entity PreSave or PostSave. Pre save is good for ADO/Metadata. Implementation to follow. * * @var string $when; * diff --git a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php index 5af6e46..2cae717 100644 --- a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php @@ -177,6 +177,7 @@ public function processItem($data) { // every processor will work only on Files. // True for now, but eventually we want processors that do only // metadata to metadata. + if (!isset($data->fid) || $data->fid == NULL || !isset($data->nid) || $data->nid == NULL || !is_array($data->metadata)) { return; } @@ -201,25 +202,30 @@ public function processItem($data) { return; } - $filelocation = $this->ensureFileAvailability($file); - - if ($filelocation === FALSE) { - $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing aborted for ADO Node ID @nodeid because we could not ensure a local file location needed for @processor. You might have run out space or have permission issues or (less likely) the original File/ADO was removed milliseconds ago.', - [ - '@processor' => $processor_instance->label(), - '@nodeid' => $data->nid, - ] - ); - // Note. If $filelocation could not be acquired, means we do not need to compost neither - // its already gone/not possible - return; + // We only need to ensure $file if we are going to use the actual file for processing. 
+ if ($processor_instance->getPluginDefinition()['input_property'] == 'filepath') { + $filelocation = $this->ensureFileAvailability($file); + if ($filelocation === FALSE) { + $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing aborted for ADO Node ID @nodeid because we could not ensure a local file location needed for @processor. You might have run out space or have permission issues or (less likely) the original File/ADO was removed milliseconds ago.', + [ + '@processor' => $processor_instance->label(), + '@nodeid' => $data->nid, + ] + ); + // Note. If $filelocation could not be acquired, means we do not need to compost neither + // its already gone/not possible + return; + } + // Means we could pass also a file directly anytime. But not really as such + // only into $data->filepath but not into $filelocation bc + // that would compost and remove the file. What if its needed later? + $data->filepath = $filelocation; + // We preset it up here. + $this->instanceFiles = [$filelocation]; + } + else { + $data->filepath = NULL; } - // Means we could pass also a file directly anytime. But not really as such - // only into $data->filepath but not into $filelocation bc - // that would compost and remove the file. What if its needed later? - $data->filepath = $filelocation; - // We preset it up here. - $this->instanceFiles = [$filelocation]; if (!isset($processor_config['output_destination']) || !is_array($processor_config['output_destination'])) { $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing aborted for ADO Node ID @nodeid because there is no output destination setup for @processor', @@ -231,11 +237,9 @@ public function processItem($data) { return; } - // Get the whole processing chain $childprocessorschain = $this->getChildProcessorIds($data->plugin_config_entity_id ?? 
'', true); - $needs_localfile_cleanup = FALSE; // If a child processor at any level will eventually chain up to a leaf (means generate queue items again) $will_chain_future = FALSE; // Just in case someone decides to avoid setting this one up @@ -255,7 +259,7 @@ public function processItem($data) { // If not cleaned up before // AND won't chain in the future - $needs_localfile_cleanup = !$will_chain_future && !$data->sbr_cleanedup_before; + $needs_localfile_cleanup = !$will_chain_future && !$data->sbr_cleanedup_before && $processor_instance->getPluginDefinition()['input_property'] == 'filepath'; // We set this before triggering cleanup, means future thinking // bc we need to make sure IF there is a next processor it will get // The info that during this queuworker processing cleanup at the end @@ -314,6 +318,8 @@ public function processItem($data) { if (is_a($entity, TranslatableInterface::class)) { $translations = $entity->getTranslationLanguages(); foreach ($translations as $translation_id => $translation) { + // checksum and file->uuid apply even if the source is not a local-ized/ensure local file. + // But we might want to review this if we plan on indexing JSON RAW/metadata directly as an vector embedding. $item_id = $entity->id() . ':' . $sequence_key . ':' . $translation_id . ':' . $file->uuid() . ':' . $data->plugin_config_entity_id; // a single 0 as return will force us to reindex. 
$inindex = $inindex * $this->flavorInSolrIndex($item_id, $data->metadata['checksum'], $indexes); @@ -323,7 +329,7 @@ public function processItem($data) { // Check if we already have this entry in Solr if ($inindex !== 0 && !$data->force) { - $this->logger->log(LogLevel::INFO, 'Flavor already in index for @plugin on ADO Node ID @nodeid, not forced, so skipping.', + $this->logger->log(LogLevel::INFO, 'Flavor already in index for @plugin on ADO Node ID @nodeid, not forced, so skipping or chaining.', [ '@plugin' => $processor_instance->getPluginId(), '@nodeid' => $data->nid, @@ -331,6 +337,12 @@ public function processItem($data) { ); } $inkeystore = TRUE; + + // For now keeping a single language. Processor might not be aware of other languages for chaining indexed? + // Reason is even if we iterate over each language, $toindex == 1. Always the same. + // @TODO May 2024. Re-Review this in Flavor Data Source provider. We could save ourselves a lot of KeyStore elements. + $processed_data_for_chaining = NULL; + // Skip file if element for every language is found in key_value collection. foreach ($item_ids as $item_id) { $processed_data = $this->keyValue->get($keyvalue_collection) @@ -340,7 +352,36 @@ public function processItem($data) { $processed_data->checksum != $data->metadata['checksum']) { $inkeystore = $inkeystore && FALSE; } + else { + // I am keeping a single one here. Should we discern by language for chaining? + // @TODO analyze what it means for us. + $processed_data_for_chaining = $processed_data; + } } + // May 2024. Allow a Processor that is to be indexed, already was processed and has data in the key store + // To use that data as input for a child one, if chained too. But only if nothing has set $io->output->plugin before + // This is needed for Processors (e.g OCR) that have already processed everything and then get a new chained + // Child that was never processed before. 
Would be terrible to have to re-process OCR completely just to get + // A Child to trigger. We will only provide $io->input->plugin['searchapi'] bc that is what we know + // Any other type of child won't be able to feed from pre-existing. + // The keystore data is injected only when no plugin output exists yet (unset or empty), + // so it never overwrites something a processor already produced. + if ($inkeystore && $tobechained && !$data->force && $processed_data_for_chaining!=NULL && empty($io->output->plugin)) { + // Since we don't know at all what $io->output->plugin should contain + // We will pass the keystore value into $io->output->plugin and let the Processor itself (needs to have that logic) + // Deal with this use case. + $this->logger->log(LogLevel::INFO, 'Chaining @plugin on ADO Node ID @nodeid with preexisting data to the next one.', + [ + '@plugin' => $processor_instance->getPluginId(), + '@nodeid' => $data->nid, + ] + ); + // Build whatever part of the $io->output->plugin structure is still missing, so the + // assignment below cannot fail on a partially initialized $io. + if (!isset($io) || !is_object($io)) { + $io = new \stdClass(); + } + if (!isset($io->output) || !is_object($io->output)) { + $io->output = new \stdClass(); + } + if (!isset($io->output->plugin) || !is_array($io->output->plugin)) { + $io->output->plugin = []; + } + $io->output->plugin['searchapi'] = $processed_data_for_chaining; + } + // Allows a force in case of corrupted key value? Partial output // External/weird data? @@ -358,6 +399,7 @@ public function processItem($data) { // Check if $io->output exists? $toindex = new stdClass(); $toindex->fulltext = $io->output->searchapi['fulltext'] ?? ''; + $toindex->config_processor_id = $data->plugin_config_entity_id ?? ''; $toindex->plaintext = $io->output->searchapi['plaintext'] ?? ''; $toindex->metadata = $io->output->searchapi['metadata'] ?? []; $toindex->who = $io->output->searchapi['who'] ?? []; @@ -370,7 +412,12 @@ public function processItem($data) { $toindex->sentiment = $io->output->searchapi['sentiment'] ?? 0; $toindex->nlplang = $io->output->searchapi['nlplang'] ?? []; $toindex->processlang = $io->output->searchapi['processlang'] ?? []; - $toindex->config_processor_id = $data->plugin_config_entity_id ?? ''; + // ML ones. + $toindex->vector_384 = $io->output->searchapi['vector_384'] ?? 
NULL; + $toindex->vector_512 = $io->output->searchapi['vector_512'] ?? NULL; + $toindex->vector_576 = $io->output->searchapi['vector_576'] ?? NULL; + $toindex->vector_1024 = $io->output->searchapi['vector_1024'] ?? NULL; + $toindex->service_md5 = $io->output->searchapi['service_md5'] ?? ''; // $siblings will be the amount of total children processors that were // enqueued for a single Processor chain. @@ -431,12 +478,21 @@ public function processItem($data) { continue ; } $childdata = clone $data; // So we do not touch original data + //@TODO. What if we want to force a child object only? + // We could IF the Child Object depends only on searchapi. + // Requires a Change in our SBR Trigger VBO plugin + // @TODO ask Allison. We might need a VBO processor to delete, selectively Flavors from Key/Solr too. + // Only way of A) removing Bias/bad vectors/Even bad OCR> And the processor should be also be able to mark + // ap:task no ML etc + /* if ($plugin_info['plugin_definition']['id'] ?? NULL == 'ml_sentence_transformer') { + $childdata->force = TRUE; + }*/ /* @var $strawberry_runners_postprocessor_config \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity */ $postprocessor_config_entity = $plugin_info['config_entity']; - $input_property = $plugin_info['plugin_definition']['input_property']; - $input_argument = $plugin_info['plugin_definition']['input_argument']; + $input_property = $plugin_info['plugin_definition']['input_property'] ?? NULL; + $input_argument = $plugin_info['plugin_definition']['input_argument'] ?? NULL; //@TODO check if this are here and not null! 
- // $io->ouput will contain whatever the output is + // $io->output will contain whatever the output is // We will check if the child processor // contains a property contained in $output // If so we check if there is a single value or multiple ones @@ -445,7 +501,7 @@ public function processItem($data) { // - Can come from the original Data (most likely) // - May be overridden by the $io->output, e.g when a processor generates a file that is not part of any node $input_property_value_from_plugin = TRUE; - $input_property_value = isset($io->output->plugin) && isset($io->output->plugin[$input_property]) ? $io->output->plugin[$input_property] : NULL; + $input_property_value = $input_property && isset($io->output->plugin) && isset($io->output->plugin[$input_property]) ? $io->output->plugin[$input_property] : NULL; // If was not defined by the previous processor try from the main data. if ($input_property_value == NULL) { $input_property_value_from_plugin = FALSE; @@ -465,31 +521,44 @@ public function processItem($data) { // Warning Diego. This may lead to a null? $childdata->{$input_property} = $input_property_value; $childdata->plugin_config_entity_id = $postprocessor_config_entity->id(); - $input_argument_value = isset($io->output->plugin) && isset($io->output->plugin[$input_argument]) ? - $io->output->plugin[$input_argument] : $data->{$input_argument}; - // This is a must: Solr indexing requires a list of sequences. A single one - // will not be enqueued. - if (is_array($input_argument_value)) { - foreach ($input_argument_value as $value) { - // Here is the catch. - // Output properties may be many - // Input Properties matching always need to be one - if (!is_array($value)) { - $childdata->{$input_argument} = $value; - // The count will always be relative to this call - // Means count of how many children are being called. 
- $childdata->siblings = count($input_argument_value); - // In case the $input_property_value is an array coming from a plugin we may want to if has the same amount of values of $input_argument_value - // If so its many to one and we only need the corresponding entry to this sequence - if ($input_property_value_from_plugin && - is_array($input_property_value) && - count($input_property_value) == $childdata->siblings && - isset($input_property_value[$value])) { - $childdata->{$input_property} = $input_property_value[$value]; + $input_argument_value = $input_argument && isset($io->output->plugin) && isset($io->output->plugin[$input_argument]) ? + $io->output->plugin[$input_argument] : ($input_argument && isset($data->{$input_argument}) ? $data->{$input_argument} : NULL); + + // May 2024, Most cases, like Pagers (PDF page extractors) $input_argument_value will be an array, a sequence + // Leading to many children. + // But for chained processors like ML ones, e.g each OCR will generate exactly ONE ML + // using the same input property of OCR. + // So we can no longer assume/not depend on $input_argument_value as we did until 0.7.0 + if ($input_argument_value) { + if (is_array($input_argument_value)) { + foreach ($input_argument_value as $value) { + // Here is the catch. + // Output properties may be many + // Input Properties matching always need to be one + if (!is_array($value)) { + $childdata->{$input_argument} = $value; + // The count will always be relative to this call + // Means count of how many children are being called. 
+ $childdata->siblings = count($input_argument_value); + // In case the $input_property_value is an array coming from a plugin we may want to know if it has the same amount of values of $input_argument_value + // If so, it is many to one, and we only need the corresponding entry to this sequence + if ($input_property_value_from_plugin && + is_array($input_property_value) && + count($input_property_value) == $childdata->siblings && + isset($input_property_value[$value])) { + $childdata->{$input_property} = $input_property_value[$value]; + } + Drupal::queue('strawberryrunners_process_background', TRUE) + ->createItem($childdata); } - Drupal::queue('strawberryrunners_process_background', TRUE) - ->createItem($childdata); } + } elseif (!empty($input_argument_value) && $input_property_value) { + // WE Have a single one. E.g Generated by a Double chaining. For 0.8.0 we will accept this option + $childdata->{$input_argument} = $input_argument_value; + $childdata->{$input_property} = $input_property_value; + $childdata->siblings = $childdata->siblings ?? 1; + Drupal::queue('strawberryrunners_process_background', TRUE) + ->createItem($childdata); } } } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/FrictionlessDataPackagePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/FrictionlessDataPackagePostProcessor.php index 92d087f..6dc2734 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/FrictionlessDataPackagePostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/FrictionlessDataPackagePostProcessor.php @@ -122,7 +122,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)', ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? 
$this->getConfiguration()['output_destination'] : [], - '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output.'), '#required' => TRUE, ]; diff --git a/src/Plugin/StrawberryRunnersPostProcessor/JsonFileSequencePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/JsonFileSequencePostProcessor.php index 9c59096..b19c7ff 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/JsonFileSequencePostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/JsonFileSequencePostProcessor.php @@ -182,7 +182,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug // We use the actual file UUID to as part of the ID // e.g default_solr_index-strawberryfield_flavor_datasource/5801:1:en:1e9f687c-e29e-4c23-91ba-655d9c5cdfe6:ocr // For the general ID we will use this number when there are multiple siblings - // or 1 if the File is a single ouput + // or 1 if the File is a single output $sequence_number[] = $io->input->metadata['sequence']; } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php new file mode 100644 index 0000000..9ccee1f --- /dev/null +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLInsightfacePostProcessor.php @@ -0,0 +1,217 @@ + 'asstructure', + 'mime_type' => ['image/jpeg'], + 'output_type' => 'json', + 'output_destination' => 'searchapi', + 'processor_queue_type' => 'background', + 'language_key' => 'language_iso639_3', + 'language_default' => 'eng', + 'timeout' => 300, + 'nlp_url' => 'http://esmero-nlp:6400', + 'ml_method' => '/image/insightface', + ] + parent::defaultConfiguration(); + } + + public function settingsForm(array $parents, FormStateInterface $form_state) { + $element = 
parent::settingsForm($parents, $form_state); + $element['source_type'] = [ + '#type' => 'select', + '#title' => $this->t('The type of source data this processor works on'), + '#options' => [ + 'asstructure' => 'File entities referenced in the as:filetype JSON structure', + ], + '#default_value' => $this->getConfiguration()['source_type'], + '#description' => $this->t('Select from where the source data this processor needs is fetched'), + '#required' => TRUE, + ]; + $element['ml_method'] = [ + '#type' => 'radios', + '#title' => $this->t('ML endpoint to use (fixed)'), + '#options' => [ + '/image/insightface' => 'InsightFace (Detections as MiniOCR Annotations and one embedding as a Unit Length Vector)', + ], + '#default_value' => $this->getConfiguration()['ml_method'], + '#description' => $this->t('The ML endpoint/Model. This is fixed for this processor.'), + '#required' => TRUE, + ]; + // Only Images for now. + $element['jsonkey']['#options'] = [ 'as:image' => 'as:image']; + return $element; + } + + protected function runTextMLfromJSON($io, NlpClient $nlpClient): \stdClass { + $output = new \stdClass(); + return $output; + } + + protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { + $output = new \stdClass(); + $config = $this->getConfiguration(); + $input_argument = $this->pluginDefinition['input_argument']; + $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? '') : 'eng']; + $sequence_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; + setlocale(LC_CTYPE, 'en_US.UTF-8'); + $width = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['width'] ?? NULL; + $height = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['height'] ?? NULL; + if (!($width && $height)) { + $width = $io->input->metadata['flv:exif']['ImageWidth'] ?? 
NULL; + $height = $io->input->metadata['flv:exif']['ImageHeight'] ?? NULL; + } + $iiifidentifier = urlencode( + StreamWrapperManager::getTarget( isset($io->input->metadata['url']) ? $io->input->metadata['url'] : NULL) + ); + + if ($iiifidentifier == NULL || empty($iiifidentifier)) { + return $output; + } + /// Mobilenet does its own (via mediapipe) image scalling. So we can pass a smaller if needed. Internally + /// it uses 480 x 480 but not good to pass square bc it makes % bbox calculation harder. + // But requires us to call info.json and pre-process the sizes. + $iiif_image_url = $config['iiif_server']."/{$iiifidentifier}/full/full/0/default.jpg"; + //@TODO we are not filtering here by label yet. Next release. + $labels = []; + $page_text = NULL; + $output->plugin = NULL; + $labels = []; + $ML = $this->callImageML($iiif_image_url,$labels); + $output->searchapi['vector_512'] = isset($ML['insightface']['vector']) && is_array($ML['insightface']['vector']) && count($ML['insightface']['vector'])== 512 ? $ML['insightface']['vector'] : NULL; + if (isset($ML['insightface']['objects']) && is_array($ML['insightface']['objects']) && count($ML['insightface']['objects']) > 0 ) { + // Don't do anything if no detection. + $miniocr = $this->insightfacenetToMiniOCR($ML['insightface']['objects'], $width, $height, $sequence_number); + $output->searchapi['fulltext'] = $miniocr; + $page_text = isset($output->searchapi['fulltext']) ? strip_tags(str_replace("", + PHP_EOL . " ", $output->searchapi['fulltext'])) : ''; + // What is a good confidence ratio here? + // based on the % of the bounding box? + // Just the value? + $labels['Face'] = 'Face'; + $output->searchapi['metadata'] = $labels; + $output->searchapi['service_md5'] = isset($ML['insightface']['modelinfo']) ? md5(json_encode($ML['insightface']['modelinfo'])) : NULL; + $output->searchapi['plaintext'] = $page_text ?? 
''; + $output->searchapi['processlang'] = $file_languages; + $output->searchapi['ts'] = date("c"); + $output->searchapi['label'] = $this->t("Insightface ML Image Embeddings & Vectors") . ' ' . $sequence_number; + $output->plugin['searchapi'] = $output->searchapi; + } + return $output; + } + + + protected function insightfacenetToMiniOCR(array $objects, $width, $height, $pageid) { + $miniocr = new \XMLWriter(); + $miniocr->openMemory(); + $miniocr->startDocument('1.0', 'UTF-8'); + $miniocr->startElement("ocr"); + $atleastone_word = FALSE; + // To avoid divisions by 0 + $pwidth = (float) $width; + $pheight = (float) $height; + // Format here is again different. Instead of normalizing on Python we do here? + // @TODO make all methods in python act the same + // :[{"bbox":[x1,y1,x2,y2],"score":0.8881509304046631}] + // We are not using labels here. We have age, gender. Discriminatory! + // NOTE: floats are in the form of .1 so we need to remove the first 0. + $miniocr->startElement("p"); + $miniocr->writeAttribute("xml:id", 'ml_insightface_' . $pageid); + $miniocr->writeAttribute("wh", + ltrim($pwidth ?? '', 0) . " " . ltrim($pheight ?? '', 0)); + $miniocr->startElement("b"); + foreach ($objects as $object) { + $notFirstWord = FALSE; + if ($object['bbox'] ?? FALSE) { + $miniocr->startElement("l"); + $x0 = (float)$object['bbox'][0]; + $y0 = (float)$object['bbox'][1]; + $w = (float)$object['bbox'][2] - $x0; + $h = (float)$object['bbox'][3] - $y0; + $l = ltrim(sprintf('%.3f', $x0) ?? '', 0); + $t = ltrim(sprintf('%.3f', $y0) ?? '', 0); + $w = ltrim(sprintf('%.3f', $w) ?? '', 0); + $h = ltrim(sprintf('%.3f', $h) ?? '', 0); + $text = (string)('Face') . ' ~ ' . (string)sprintf('%.3f', $object['score'] ?? 0); + + if ($notFirstWord) { + $miniocr->text(' '); + } + $notFirstWord = TRUE; + // New OCR Highlight does not like empty tags at all + if (strlen(trim($text ?? '')) > 0) { + $miniocr->startElement("w"); + $miniocr->writeAttribute("x", + $l . ' ' . $t . ' ' . $w . ' ' . 
$h); + $miniocr->text($text); + // Only assume we have at least one word for tags + // Since lines? could end empty? + $atleastone_word = TRUE; + $miniocr->endElement(); + } + $miniocr->endElement(); + } + } + $miniocr->endElement(); + $miniocr->endElement(); + $miniocr->endElement(); + $miniocr->endDocument(); + if ($atleastone_word) { + return $miniocr->outputMemory(TRUE); + } + else { + return StrawberryfieldFlavorDatasource::EMPTY_MINIOCR_XML; + } + } + + public function callImageML($image_url, $labels):mixed { + $nlpClient = $this->getNLPClient(); + $config = $this->getConfiguration(); + $arguments['iiif_image_url'] = $image_url; + //@TODO we are not filtering here by label yet. Next release. + $arguments['labels'] = $labels; + $ML = $nlpClient->get_call($config['ml_method'], $arguments, 1); + return $ML; + } + + public function callTextML($text, $query):mixed { + return FALSE; + } +} diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php new file mode 100644 index 0000000..db23bd4 --- /dev/null +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLMobileNetPostProcessor.php @@ -0,0 +1,223 @@ + 'asstructure', + 'mime_type' => ['image/jpeg'], + 'output_type' => 'json', + 'output_destination' => 'searchapi', + 'processor_queue_type' => 'background', + 'language_key' => 'language_iso639_3', + 'language_default' => 'eng', + 'timeout' => 300, + 'nlp_url' => 'http://esmero-nlp:6400', + 'ml_method' => '/image/mobilenet', + ] + parent::defaultConfiguration(); + } + + public function settingsForm(array $parents, FormStateInterface $form_state) { + $element = parent::settingsForm($parents, $form_state); + $element['source_type'] = [ + '#type' => 'select', + '#title' => $this->t('The type of source data this processor works on'), + '#options' => [ + 'asstructure' => 'File entities referenced in the as:filetype JSON structure', + ], + '#default_value' => 
$this->getConfiguration()['source_type'], + '#description' => $this->t('Select from where the source data this processor needs is fetched'), + '#required' => TRUE, + ]; + $element['ml_method'] = [ + '#type' => 'radios', + '#title' => $this->t('ML endpoint to use (fixed)'), + '#options' => [ + '/image/mobilenet' => 'MobileNet (Image embeddings as a a Unit Length Vector)', + ], + '#default_value' => $this->getConfiguration()['ml_method'], + '#description' => $this->t('The ML endpoint/Model. This is fixed for this processor.'), + '#required' => TRUE, + ]; + // Only Images for now. + $element['jsonkey']['#options'] = [ 'as:image' => 'as:image']; + + return $element; + } + + protected function runTextMLfromJSON($io, NlpClient $nlpClient): \stdClass { + $output = new \stdClass(); + return $output; + // TODO: Implement runTextMLfromJSON() method. + } + + protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { + $output = new \stdClass(); + $config = $this->getConfiguration(); + $input_argument = $this->pluginDefinition['input_argument']; + $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? '') : 'eng']; + $sequence_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; + setlocale(LC_CTYPE, 'en_US.UTF-8'); + $width = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['width'] ?? NULL; + $height = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['height'] ?? NULL; + if (!($width && $height)) { + $width = $io->input->metadata['flv:exif']['ImageWidth'] ?? NULL; + $height = $io->input->metadata['flv:exif']['ImageHeight'] ?? NULL; + } + $iiifidentifier = urlencode( + StreamWrapperManager::getTarget( isset($io->input->metadata['url']) ? 
$io->input->metadata['url'] : NULL) + ); + + if ($iiifidentifier == NULL || empty($iiifidentifier)) { + return $output; + } + /// Mobilenet does its own (via mediapipe) image scaling. So we can pass a smaller if needed. Internally + /// it uses 480 x 480 but not good to pass square bc it makes % bbox calculation harder. + // But requires us to call info.json and pre-process the sizes. + $iiif_image_url = $config['iiif_server']."/{$iiifidentifier}/full/full/0/default.jpg"; + //@TODO we are not filtering here by label yet. Next release. + $labels = []; + $page_text = NULL; + $output->plugin = NULL; + $ML = $this->callImageML($iiif_image_url,$labels); + $output->searchapi['vector_1024'] = isset($ML['mobilenet']['vector']) && is_array($ML['mobilenet']['vector']) && count($ML['mobilenet']['vector'])== 1024 ? $ML['mobilenet']['vector'] : NULL; + if (isset($ML['mobilenet']['objects']) && is_array($ML['mobilenet']['objects']) && count($ML['mobilenet']['objects']) > 0 ) { + $miniocr = $this->mobilenetToMiniOCR($ML['mobilenet']['objects'], $width, $height, $sequence_number); + $output->searchapi['fulltext'] = $miniocr; + $page_text = isset($output->searchapi['fulltext']) ? strip_tags(str_replace("", + PHP_EOL . " ", $output->searchapi['fulltext'])) : ''; + // What is a good confidence ratio here? + // based on the % of the bounding box? + // Just the value? + // Category names live on each detection's 'categories' entries (the same shape + // mobilenetToMiniOCR() reads), so collect them from there, de-duplicated by name. + foreach ($ML['mobilenet']['objects'] as $object) { + foreach ((array) ($object['categories'] ?? []) as $category) { + if (isset($category['category_name'])) { + $labels[$category['category_name']] = $category['category_name']; + } + } + } + } + $output->searchapi['metadata'] = $labels; + $output->searchapi['service_md5'] = isset($ML['mobilenet']['modelinfo']) ? md5(json_encode($ML['mobilenet']['modelinfo'])) : NULL; + $output->searchapi['plaintext'] = $page_text ?? ''; + $output->searchapi['processlang'] = $file_languages; + $output->searchapi['ts'] = date("c"); + $output->searchapi['label'] = $this->t("MobileNet ML Image Embeddings & Vectors") . ' ' . 
$sequence_number; + $output->plugin['searchapi'] = $output->searchapi; + return $output; + } + + + protected function mobilenetToMiniOCR(array $objects, $width, $height, $pageid) { + $miniocr = new \XMLWriter(); + $miniocr->openMemory(); + $miniocr->startDocument('1.0', 'UTF-8'); + $miniocr->startElement("ocr"); + $atleastone_word = FALSE; + // To avoid divisions by 0 + $pwidth = (float) $width; + $pheight = (float) $height; + // Format here is different. Instead of normalizing on Python we do here? + // @TODO make all methods in python act the same + // :[{"bounding_box":{"height":0.9609375,"origin_x":0.0,"origin_y":0.0453125,"width":1.0},"categories":[{"category_name":"person","display_name":null,"index":null,"score":0.8881509304046631}] + // NOTE: floats are in the form of .1 so we need to remove the first 0. + $miniocr->startElement("p"); + $miniocr->writeAttribute("xml:id", 'ml_mobilenet_' . $pageid); + $miniocr->writeAttribute("wh", + ltrim($pwidth ?? '', 0) . " " . ltrim($pheight ?? '', 0)); + $miniocr->startElement("b"); + foreach ($objects as $object) { + $notFirstWord = FALSE; + if ($object['bounding_box'] ?? FALSE) { + $miniocr->startElement("l"); + $x0 = (float)$object['bounding_box']['origin_x']; + $y0 = (float)$object['bounding_box']['origin_y']; + $w = (float)$object['bounding_box']['width']; + $h = (float)$object['bounding_box']['height']; + $l = ltrim(sprintf('%.3f', $x0) ?? '', 0); + $t = ltrim(sprintf('%.3f', $y0) ?? '', 0); + $w = ltrim(sprintf('%.3f', $w) ?? '', 0); + $h = ltrim(sprintf('%.3f', $h) ?? '', 0); + $text = ''; + foreach ($object['categories'] as $category) { + $text .= (string)($category['category_name'] ?? 'Unlabeled') . ' ~ ' . (string)sprintf('%.3f', $category['score'] ?? 0); + } + if ($notFirstWord) { + $miniocr->text(' '); + } + $notFirstWord = TRUE; + // New OCR Highlight does not like empty tags at all + if (strlen(trim($text ?? '')) > 0) { + $miniocr->startElement("w"); + $miniocr->writeAttribute("x", + $l . ' ' . $t . 
' ' . $w . ' ' . $h); + $miniocr->text($text); + // Only assume we have at least one word for tags + // Since lines? could end empty? + $atleastone_word = TRUE; + $miniocr->endElement(); + } + $miniocr->endElement(); + } + } + $miniocr->endElement(); + $miniocr->endElement(); + $miniocr->endElement(); + $miniocr->endDocument(); + if ($atleastone_word) { + return $miniocr->outputMemory(TRUE); + } + else { + return StrawberryfieldFlavorDatasource::EMPTY_MINIOCR_XML; + } + } + + public function callImageML($image_url, $labels):mixed { + $nlpClient = $this->getNLPClient(); + $config = $this->getConfiguration(); + $arguments['iiif_image_url'] = $image_url; + //@TODO we are not filtering here by label yet. Next release. + $arguments['labels'] = $labels; + $ML = $nlpClient->get_call($config['ml_method'], $arguments, 1); + return $ML; + } + + public function callTextML($text, $query):mixed { + return FALSE; + } +} diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLSentenceTransformertPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLSentenceTransformertPostProcessor.php new file mode 100644 index 0000000..d6775a3 --- /dev/null +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLSentenceTransformertPostProcessor.php @@ -0,0 +1,158 @@ + 'asstructure', + 'mime_type' => ['image/jpeg'], + 'output_type' => 'json', + 'output_destination' => 'searchapi', + 'processor_queue_type' => 'background', + 'language_key' => 'language_iso639_3', + 'language_default' => 'eng', + 'timeout' => 300, + 'nlp_url' => 'http://esmero-nlp:6400', + 'ml_method' => '/text/sentence_transformer', + ] + parent::defaultConfiguration(); + } + + public function settingsForm(array $parents, FormStateInterface $form_state) { + $element = parent::settingsForm($parents, $form_state); + $element['source_type'] = [ + '#type' => 'select', + '#title' => $this->t('The type of source data this processor works on'), + '#options' => [ + 'json' => 'JSON passed by a parent Processor.This processor needs 
to be chained to another one that generates Text. e.g OCR.', + ], + '#default_value' => $this->getConfiguration()['source_type'], + '#description' => $this->t('Select from where the source file this processor needs is fetched'), + '#required' => TRUE, + ]; + $element['output_type'] = [ + '#type' => 'select', + '#title' => $this->t('The expected and desired output of this processor.'), + '#options' => [ + 'json' => 'Data/Values that can be serialized to JSON', + ], + '#default_value' => $this->getConfiguration()['output_type'], + '#description' => $this->t('If the output is just data and "One or more Files" is selected all data will be dumped into a file and handled as such.'), + ]; + $element['output_destination'] = [ + '#type' => 'checkboxes', + '#title' => $this->t("Where and how the output will be used."), + '#options' => [ + 'plugin' => 'As Input for another processor Plugin', + 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)', + ], + '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? $this->getConfiguration()['output_destination'] : [], + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output. This plugin always generates also search API output data.'), + '#required' => TRUE, + ]; + $element['ml_method'] = [ + '#type' => 'radios', + '#title' => $this->t('ML endpoint to use (fixed)'), + '#options' => [ + '/text/sentence_transformer' => 'SBert Sentence Transformer (text embeddings as a Unit Length Vector)', + ], + '#default_value' => $this->getConfiguration()['ml_method'], + '#description' => $this->t('The ML endpoint/Model. 
This is fixed for this processor.'), + '#required' => TRUE, + ]; + + return $element; + } + + /** + * Builds the Search API payload for the text a parent processor handed over as JSON. + * + * @param \stdClass $io + * Reads $io->input->{input_property} (plaintext, metadata, fulltext), $io->input->lang and the sequence argument. + * @param \Drupal\strawberry_runners\Web64\Nlp\NlpClient $nlpClient + * Not used directly; the request goes through ::callTextML(). + * + * @return \stdClass + * The searchapi and plugin properties are only populated when there is plain text to embed. + */ + protected function runTextMLfromJSON($io, NlpClient $nlpClient): \stdClass + { + $output = new \stdClass(); + $config = $this->getConfiguration(); + + $input_argument = $this->pluginDefinition['input_argument']; + $input_property = $this->pluginDefinition['input_property']; + + $file_languages = isset($io->input->lang) ? (array)$io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? '') : 'eng']; + $sequence_number = isset($io->input->{$input_argument}) ? (int)$io->input->{$input_argument} : 1; + + setlocale(LC_CTYPE, 'en_US.UTF-8'); + if (isset($io->input->{$input_property})) { + $page_text = $io->input->{$input_property}->plaintext ?? NULL; + if ($page_text) { + $labels = []; + $output->plugin = NULL; + $ML = $this->callTextML($page_text, false); + $output->searchapi['vector_384'] = isset($ML['sentence_transformer']['vector']) && is_array($ML['sentence_transformer']['vector']) && count($ML['sentence_transformer']['vector']) == 384 ? $ML['sentence_transformer']['vector'] : NULL; + $output->searchapi['metadata'] = $io->input->{$input_property}->metadata ?? []; + // Hash the model info of this endpoint. Assumed to sit next to the vector under 'sentence_transformer' + // (the YOLO processor reads 'yolo' for both) - TODO confirm against the NLP server response. + $output->searchapi['service_md5'] = isset($ML['sentence_transformer']['modelinfo']) ? md5(json_encode($ML['sentence_transformer']['modelinfo'])) : NULL; + $output->searchapi['plaintext'] = $page_text ?? ''; + $output->searchapi['fulltext'] = $io->input->{$input_property}->fulltext ?? []; + $output->searchapi['processlang'] = $file_languages; + $output->searchapi['ts'] = date("c"); + $output->searchapi['label'] = $this->t("Sentence Transformer ML Text Embeddings & Vectors") . ' ' .
$sequence_number; + $output->plugin['searchapi'] = $output->searchapi; + } + } + return $output; + } + + public function callImageML($image_url, $labels):mixed { + return FALSE; + } + + public function callTextML($text, $query = TRUE):mixed { + $nlpClient = $this->getNLPClient(); + $config = $this->getConfiguration(); + $arguments['text'] = $text; + if ($query) { + $arguments['query'] = TRUE; + } + //@TODO we are not filtering here by label yet. Next release. + $ML = $nlpClient->get_call($config['ml_method'], $arguments, 1); + return $ML; + } + + protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass + { + $output = new \stdClass(); + return $output; + } + +} diff --git a/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php new file mode 100644 index 0000000..cf89ee9 --- /dev/null +++ b/src/Plugin/StrawberryRunnersPostProcessor/MLYoloPostProcessor.php @@ -0,0 +1,213 @@ + 'asstructure', + 'mime_type' => ['image/jpeg'], + 'output_type' => 'json', + 'output_destination' => 'searchapi', + 'processor_queue_type' => 'background', + 'language_key' => 'language_iso639_3', + 'language_default' => 'eng', + 'timeout' => 300, + 'nlp_url' => 'http://esmero-nlp:6400', + // Has to be one of the keys offered by the 'ml_method' radios in ::settingsForm(). + 'ml_method' => '/image/yolo', + ] + parent::defaultConfiguration(); + } + + public function settingsForm(array $parents, FormStateInterface $form_state) { + $element = parent::settingsForm($parents, $form_state); + $element['source_type'] = [ + '#type' => 'select', + '#title' => $this->t('The type of source data this processor works on'), + '#options' => [ + 'asstructure' => 'File entities referenced in the as:filetype JSON structure', + ], + '#default_value' => $this->getConfiguration()['source_type'], + '#description' => $this->t('Select from where the source data this processor needs is fetched'), + '#required' => TRUE, + ]; + $element['ml_method'] = [ + '#type' => 'radios', + '#title' => $this->t('ML
endpoint to use (fixed)'), + '#options' => [ + '/image/yolo' => 'YOLO (Image Object detection (as MiniOCR Annotations) & embedding as a Unit Length Vector)', + ], + '#default_value' => $this->getConfiguration()['ml_method'], + '#description' => $this->t('The ML endpoint/Model. This is fixed for this processor.'), + '#required' => TRUE, + ]; + // Only Images for now. + $element['jsonkey']['#options'] = [ 'as:image' => 'as:image']; + return $element; + } + + protected function runTextMLfromJSON($io, NlpClient $nlpClient): \stdClass { + $output = new \stdClass(); + return $output; + // TODO: Implement runTextMLfromJSON() method. + } + + protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass { + $output = new \stdClass(); + $config = $this->getConfiguration(); + $input_argument = $this->pluginDefinition['input_argument']; + $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? '') : 'eng']; + // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} + $sequence_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; + setlocale(LC_CTYPE, 'en_US.UTF-8'); + $width = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['width'] ?? NULL; + $height = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['height'] ?? NULL; + if (!($width && $height)) { + $width = $io->input->metadata['flv:exif']['ImageWidth'] ?? NULL; + $height = $io->input->metadata['flv:exif']['ImageHeight'] ?? NULL; + } + $iiifidentifier = urlencode( + StreamWrapperManager::getTarget( isset($io->input->metadata['url']) ? $io->input->metadata['url'] : NULL) + ); + + if ($iiifidentifier == NULL || empty($iiifidentifier)) { + return $output; + } + //@TODO we know yolov8 takes 640px. We can pass just that to make it faster. + // But requires us to call info.json and pre-process the sizes. 
+ $iiif_image_url = $config['iiif_server']."/{$iiifidentifier}/full/full/0/default.jpg"; + //@TODO we are not filtering here by label yet. Next release. + $labels = []; + $page_text = NULL; + $output->plugin = NULL; + $labels = []; + $ML = $this->callImageML($iiif_image_url,$labels); + $output->searchapi['vector_576'] = isset($ML['yolo']['vector']) && is_array($ML['yolo']['vector']) && count($ML['yolo']['vector'])== 576 ? $ML['yolo']['vector'] : NULL; + if (isset($ML['yolo']['objects']) && is_array($ML['yolo']['objects']) && count($ML['yolo']['objects']) > 0 ) { + $miniocr = $this->yoloToMiniOCR($ML['yolo']['objects'], $width, $height, $sequence_number); + $output->searchapi['fulltext'] = $miniocr; + $page_text = isset($output->searchapi['fulltext']) ? strip_tags(str_replace("", + PHP_EOL . " ", $output->searchapi['fulltext'])) : ''; + // What is a good confidence ratio here? + // based on the % of the bounding box? + // Just the value? + foreach($ML['yolo']['objects'] as $object) { + $labels[$object['name']] = $object['name']; + } + } + $output->searchapi['metadata'] = $labels; + $output->searchapi['service_md5'] = isset($ML['yolo']['modelinfo']) ? md5(json_encode($ML['yolo']['modelinfo'])) : NULL; + $output->searchapi['plaintext'] = $page_text ?? ''; + $output->searchapi['processlang'] = $file_languages; + $output->searchapi['ts'] = date("c"); + $output->searchapi['label'] = $this->t("ML Image Embeddings & Vectors") . ' ' . $sequence_number; + $output->plugin['searchapi'] = $output->searchapi; + return $output; + } + + + protected function yoloToMiniOCR(array $objects, $width, $height, $pageid) { + $miniocr = new \XMLWriter(); + $miniocr->openMemory(); + $miniocr->startDocument('1.0', 'UTF-8'); + $miniocr->startElement("ocr"); + $atleastone_word = FALSE; + // To avoid divisions by 0 + $pwidth = (float) $width; + $pheight = (float) $height; + // NOTE: floats are in the form of .1 so we need to remove the first 0. 
+ $miniocr->startElement("p"); + $miniocr->writeAttribute("xml:id", 'ml_yolo_' . $pageid); + $miniocr->writeAttribute("wh", + ltrim($pwidth ?? '', 0) . " " . ltrim($pheight ?? '', 0)); + $miniocr->startElement("b"); + foreach ($objects as $object) { + $notFirstWord = FALSE; + $miniocr->startElement("l"); + $x0 = (float) $object['box']['x1']; + $y0 = (float) $object['box']['y1']; + $x1 = (float) $object['box']['x2']; + $y1 = (float) $object['box']['y2']; + $l = ltrim(sprintf('%.3f', $x0) ?? '', 0); + $t = ltrim(sprintf('%.3f', $y0) ?? '', 0); + $w = ltrim(sprintf('%.3f', ($x1 - $x0)) ?? '', 0); + $h = ltrim(sprintf('%.3f', ($y1 - $y0)) ?? '', 0); + // Apply the fallback to the array key itself: an interpolated string is never NULL, + // so the previous form could not reach "0" and warned on a missing 'confidence'. + $text = (string) ($object['name'] ?? 'Unlabeled') .' ~ '. (string) ($object['confidence'] ?? "0"); + if ($notFirstWord) { + $miniocr->text(' '); + } + $notFirstWord = TRUE; + // New OCR Highlight does not like empty tags at all + if (strlen(trim($text ?? '')) > 0) { + $miniocr->startElement("w"); + $miniocr->writeAttribute("x", + $l . ' ' . $t . ' ' . $w . ' ' . $h); + $miniocr->text($text); + // Only assume we have at least one word for tags + // Since lines? could end empty? + $atleastone_word = TRUE; + $miniocr->endElement(); + } + $miniocr->endElement(); + } + $miniocr->endElement(); + $miniocr->endElement(); + $miniocr->endElement(); + $miniocr->endDocument(); + if ($atleastone_word) { + return $miniocr->outputMemory(TRUE); + } + else { + return StrawberryfieldFlavorDatasource::EMPTY_MINIOCR_XML; + } + } + + public function callImageML($image_url, $labels):mixed { + $nlpClient = $this->getNLPClient(); + $config = $this->getConfiguration(); + $arguments['iiif_image_url'] = $image_url; + //@TODO we are not filtering here by label yet. Next release.
+ $arguments['labels'] = $labels; + $ML = $nlpClient->get_call($config['ml_method'], $arguments, 1); + return $ML; + } + + public function callTextML($text, $query):mixed { + return FALSE; + } + +} diff --git a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php index cda3298..e393d94 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php @@ -10,7 +10,6 @@ use Drupal\Core\Cache\CacheBackendInterface; use Drupal\Core\Form\FormStateInterface; -use Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor; use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginInterface; use Drupal\strawberryfield\Plugin\search_api\datasource\StrawberryfieldFlavorDatasource; use Drupal\strawberry_runners\Web64\Nlp\NlpClient; @@ -206,7 +205,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)', ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? $this->getConfiguration()['output_destination'] : [], - '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output. This plugin always generates also search API output data.'), '#required' => TRUE, ]; @@ -301,12 +300,12 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $input_argument = $this->pluginDefinition['input_argument']; $file_uuid = isset($io->input->metadata['dr:uuid']) ? $io->input->metadata['dr:uuid'] : NULL; $node_uuid = isset($io->input->nuuid) ? 
$io->input->nuuid : NULL; - $config = $this->getConfiguration(); $timeout = $config['timeout']; // in seconds $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? '') : 'eng']; if (isset($io->input->{$input_property}) && $file_uuid && $node_uuid) { $output = new \stdClass(); + $output->plugin = NULL; // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} $sequence_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; setlocale(LC_CTYPE, 'en_US.UTF-8'); @@ -340,12 +339,14 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug ]); } $output->searchapi['fulltext'] = $miniocr; - $output->plugin = $miniocr; + // This is temporary. $io->output = $output; } } - //if not searchable run try to load the ADO, check if there is a as:text HOCR with the same size + //if not searchable try to load the ADO, check if there is an as:text HOCR with the same size //as the current Image and try to process, if not, run, tesseract + // @TODO. Ask Allison. If PDFAlto worked out, do we still need to check if there is an attached HOCR? + // Or does an attached HOCR always wins over PDFtoAlto? $width = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['width'] ?? NULL; $height = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['height'] ?? 
NULL; // In case identify failed, we can try with flv:exif (e.g JP2s might not pass the identify test) @@ -433,7 +434,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug ]); } $output->searchapi['fulltext'] = $miniocr; - $output->plugin = $miniocr; + $io->output = $output; $external_found = TRUE; } @@ -449,9 +450,9 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug } } } - // At this stage only run Tesseract if we are still without $output->plugin + // At this stage only run Tesseract if we are still without $output->searchapi['fulltext'] - if (!isset($output->plugin) || $output->plugin == NULL) { + if (!isset($output->searchapi['fulltext']) || $output->searchapi['fulltext'] == NULL) { setlocale(LC_CTYPE, 'en_US.UTF-8'); $execstring = $this->buildExecutableCommand($io); if ($execstring) { @@ -483,11 +484,10 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug ]); } $output->searchapi['fulltext'] = $miniocr; - $output->plugin = $miniocr; } } - if (!isset($output->plugin) || $output->plugin == NULL) { + if (!isset($output->searchapi['fulltext']) || $output->searchapi['fulltext'] == NULL) { // If we still have no OCR at this state it is time to bail out $this->logger->warning("@sbr_processor: HOCR to miniOCR processing from Tesseract failed for ADO with UUID @node_uuid and File with UUID @file_uuid with sequence number @sequence_id", [ @@ -605,6 +605,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $output->searchapi['processlang'] = $file_languages; $output->searchapi['ts'] = date("c"); $output->searchapi['label'] = $this->t("Sequence") . ' ' . 
$sequence_number; + $output->plugin['searchapi'] = $output->searchapi; $io->output = $output; } else { @@ -696,7 +697,6 @@ public function buildExecutableCommand(\stdClass $io) { // Only return $command if it contains the original filepath somewhere if (strpos($command, $file_path) !== FALSE) { return $command; - error_log($command); } return NULL; } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/SubtitlePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/SubtitlePostProcessor.php index 3161e6b..672a20a 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/SubtitlePostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/SubtitlePostProcessor.php @@ -142,7 +142,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)', ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? $this->getConfiguration()['output_destination'] : [], - '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output.'), '#required' => TRUE, ]; @@ -244,6 +244,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? '') : 'eng']; if ($file_path && $file_uuid && $node_uuid) { $output = new \stdClass(); + $output->plugin = NULL; // Let's see if we need an output path or not $file_path = isset($io->input->{$input_property}) ? 
$io->input->{$input_property} : NULL; $out_file_path = NULL; @@ -266,7 +267,6 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $output->searchapi['fulltext'] = $miniocr ?? StrawberryfieldFlavorDatasource::EMPTY_MINIOCR_XML; - $output->plugin = $text_content; $output->searchapi['plaintext'] = $page_text; } else { @@ -413,6 +413,7 @@ function ($languages_enabled) { $output->searchapi['ts'] = date("c"); $output->searchapi['label'] = $this->t("Sequence") . ' ' . $sequence_number; + $output->plugin['searchapi'] = $output->searchapi; $io->output = $output; } else { diff --git a/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php index 94bfc90..2d5aadb 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php @@ -142,7 +142,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)', ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? 
$this->getConfiguration()['output_destination'] : [], - '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output.'), '#required' => TRUE, ]; diff --git a/src/Plugin/StrawberryRunnersPostProcessor/TextPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/TextPostProcessor.php index 4455bc5..47af9c8 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/TextPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/TextPostProcessor.php @@ -141,7 +141,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)', ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? $this->getConfiguration()['output_destination'] : [], - '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output.'), '#required' => TRUE, ]; @@ -286,7 +286,6 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug } $output->searchapi['fulltext'] = StrawberryfieldFlavorDatasource::EMPTY_MINIOCR_XML; - $output->plugin = $text_content; $output->searchapi['plaintext'] = $page_text; } else { @@ -433,6 +432,7 @@ function ($languages_enabled) { $output->searchapi['ts'] = date("c"); $output->searchapi['label'] = $this->t("Sequence") . ' ' . 
$sequence_number; + $output->plugin['searchapi'] = $output->searchapi; $io->output = $output; } else { diff --git a/src/Plugin/StrawberryRunnersPostProcessor/WarcExtractionPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/WarcExtractionPostProcessor.php index 68bfb76..214af14 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/WarcExtractionPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/WarcExtractionPostProcessor.php @@ -112,7 +112,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'plugin' => 'As Input for another processor Plugin', ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination']))? $this->getConfiguration()['output_destination']: [], - '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output.'), '#required' => TRUE, ]; diff --git a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php index aebd46a..513b7ed 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php @@ -84,7 +84,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)', ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? 
$this->getConfiguration()['output_destination'] : [], - '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output. This plugin always generates also search API output data.'), '#required' => TRUE, ]; @@ -291,7 +291,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $output->searchapi['processlang'] = []; $output->searchapi['label'] = $page_title; $output->searchapi['ts'] = $page_ts; - $output->plugin = $output->searchapi; + $output->plugin['searchapi'] = $output->searchapi; } else { throw new \Exception("WebPage Text was not a valid JSON."); diff --git a/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php new file mode 100644 index 0000000..733e6dd --- /dev/null +++ b/src/Plugin/StrawberryRunnersPostProcessor/abstractMLPostProcessor.php @@ -0,0 +1,330 @@ + 'asstructure', + 'mime_type' => ['image/jpeg'], + 'output_type' => 'json', + 'output_destination' => 'searchapi', + 'processor_queue_type' => 'background', + 'language_key' => 'language_iso639_3', + 'language_default' => 'eng', + 'timeout' => 300, + 'nlp_url' => 'http://esmero-nlp:6400', + 'ml_method' => NULL, + 'iiif_server' => '', + ] + parent::defaultConfiguration(); + } + + public const ML_IMAGE_VECTOR_SIZE = [ + '/image/yolo' => 576, + '/image/mobilenet' => 1024, + '/image/insightface' => 512, + ]; + + public const ML_TEXT_VECTOR_SIZE = [ + '/text/sentence_transformer' => 384, + ]; + + protected $nlp_client = null; + + public function calculateDependencies() { + // Since Processors could be chained we need to check if any other + // processor instance is using an instance of this one + // @TODO: Implement calculateDependencies() method. 
+ } + + public function settingsForm(array $parents, FormStateInterface $form_state) { + + $element['source_type'] = [ + '#type' => 'select', + '#title' => $this->t('The type of source data this processor works on'), + '#options' => [ + 'asstructure' => 'File entities referenced in the as:filetype JSON structure', + 'ado' => 'ADO Strawberryfield JSON', + 'json' => 'JSON provided by another Processor' + ], + '#default_value' => $this->getConfiguration()['source_type'], + '#description' => $this->t('Select from where the source data this processor needs is fetched'), + '#required' => TRUE, + ]; + + $element['ado_type'] = [ + '#type' => 'textfield', + '#title' => $this->t('ADO type(s) to limit this processor to.'), + '#default_value' => $this->getConfiguration()['ado_type'], + '#description' => $this->t('A single ADO type or a coma delimited list of ado types that qualify to be Processed. Leave empty to apply to all ADOs.'), + ]; + + $element['jsonkey'] = [ + '#type' => 'checkboxes', + '#title' => $this->t('The JSON key that contains the desired source.'), + '#options' => [ + 'as:image' => 'as:image', + 'as:document' => 'as:document', + 'as:audio' => 'as:audio', + 'as:video' => 'as:video', + 'as:text' => 'as:text', + 'as:application' => 'as:application', + ], + '#default_value' => (!empty($this->getConfiguration()['jsonkey']) && is_array($this->getConfiguration()['jsonkey'])) ? $this->getConfiguration()['jsonkey'] : [], + '#states' => [ + 'visible' => [ + ':input[name="pluginconfig[source_type]"]' => ['value' => 'asstructure'], + ], + ], + ]; + + $element['jmespath'] = [ + '#type' => 'textfield', + '#title' => $this->t('Jmespath used to fetch/prefilter the metadata passed as JSON to the processor'), + '#default_value' => (!empty($this->getConfiguration()['jmespath']) && is_array($this->getConfiguration()['jmespath'])) ? 
$this->getConfiguration()['jmespath'] : [], + '#states' => [ + 'visible' => [ + ':input[name="pluginconfig[source_type]"]' => ['value' => 'ado'], + ], + ], + ]; + + $element['mime_type'] = [ + '#type' => 'textfield', + '#title' => $this->t('Mimetypes(s) to limit this Processor to.'), + '#default_value' => $this->getConfiguration()['mime_type'], + '#description' => $this->t('A single Mimetype type or a comma separated list of mimetypes that qualify to be Processed. Leave empty to apply any file'), + '#states' => [ + 'visible' => [ + ':input[name="pluginconfig[source_type]"]' => ['value' => 'asstructure'], + ], + ], + ]; + + $element['language_key'] = [ + '#type' => 'textfield', + '#title' => $this->t("Within the ADO's metadata, the JSON key that contains the language in ISO639-3 (3 letter)"), + '#default_value' => (!empty($this->getConfiguration()['language_key'])) ? $this->getConfiguration()['language_key'] : '', + '#required' => TRUE, + ]; + + $element['language_default'] = [ + '#type' => 'textfield', + '#title' => $this->t("Please provide a default language in ISO639-3 (3 letter) format. If none is provided we will use 'eng' "), + '#default_value' => (!empty($this->getConfiguration()['language_default'])) ? 
$this->getConfiguration()['language_default'] : 'eng', + '#required' => TRUE, + ]; + + + $element['output_type'] = [ + '#type' => 'select', + '#title' => $this->t('The expected and desired output of this processor.'), + '#options' => [ + 'json' => 'Data/Values that can be serialized to JSON', + ], + '#default_value' => $this->getConfiguration()['output_type'], + '#description' => $this->t('ML processors only generate JSON'), + ]; + + $element['output_destination'] = [ + '#type' => 'checkboxes', + '#title' => $this->t("Where and how the output will be used."), + '#options' => [ + 'plugin' => 'As Input for another processor Plugin', + 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for ML Vector Comparison)', + ], + '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? $this->getConfiguration()['output_destination'] : [], + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this output.'), + '#required' => TRUE, + ]; + + $element['processor_queue_type'] = [ + '#type' => 'select', + '#title' => $this->t('The queue to use for this processor.'), + '#options' => [ + 'background' => 'Secondary queue in background', + 'realtime' => 'Primary queue in realtime', + ], + '#default_value' => $this->getConfiguration()['processor_queue_type'], + '#description' => $this->t('The primary queue will be execute in realtime while the Secondary will be execute in background'), + '#required' => TRUE, + ]; + + $element['nlp_url'] = [ + '#type' => 'url', + '#title' => $this->t("The URL location of your NLP64/ML server."), + '#default_value' => $this->getConfiguration()['nlp_url'] ?? 
'http://esmero-nlp:6400', + '#description' => t('Defaults to http://esmero-nlp:6400'), + '#required' => TRUE, + ]; + + $element['ml_method'] = [ + '#type' => 'radios', + '#title' => $this->t('Which ML endpoint to use'), + '#options' => [ + '/image/yolo' => 'YOLO (Image Object detection (as MiniOCR Annotations) & embedding as a Unit Length Vector)', + '/image/mobilenet' => 'MobileNet (Image embeddings as a a Unit Length Vector)', + '/text/sentence_transformer' => 'SBert Sentence Transformer (text embeddings as a Unit Length Vector)', + '/image/insightface' => 'InsightFace (Detection as MiniOCR Annotations and embedding as a Unit Length Vector)', + ], + '#default_value' => $this->getConfiguration()['ml_method'], + '#description' => $this->t('The ML endpoint/Model. Depending on the choice the actual value/size of data ingested will vary.'), + '#required' => TRUE, + ]; + + $element['iiif_server'] = [ + '#type' => 'url', + '#title' => $this->t('The IIIF Server to use for Image ML'), + '#default_value' => $this->getConfiguration()['iiif_server'] ?: \Drupal::service('config.factory') + ->get('format_strawberryfield.iiif_settings') + ->get('int_server_url'), + '#description' => $this->t('The IIIF Server to use. 
By default we will use the Internal (esmero-cantaloupe) endpoint'), + '#required' => TRUE, + ]; + + $element['timeout'] = [ + '#type' => 'number', + '#title' => $this->t('Timeout in seconds for this process.'), + '#default_value' => $this->getConfiguration()['timeout'], + '#description' => $this->t('If the process runs out of time it can still be processed again.'), + '#size' => 4, + '#maxlength' => 4, + '#min' => 1, + ]; + $element['weight'] = [ + '#type' => 'number', + '#title' => $this->t('Order or execution in the global chain.'), + '#default_value' => $this->getConfiguration()['weight'], + ]; + return $element; + } + + + + public function onDependencyRemoval(array $dependencies) { + // Since Processors could be chained we need to check if any other + // processor instance is using an instance of this one + return parent::onDependencyRemoval( + $dependencies + ); // TODO: Change the autogenerated stub + } + + /** + * Executes the logic of this plugin given a file path and a context. + * + * @param \stdClass $io + * $io->input needs to contain + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_arguments + * $io->output will contain the result of the processor + * @param string $context + */ + public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPluginInterface::PROCESS) { + $input_property = $this->pluginDefinition['input_property']; + $file_uuid = isset($io->input->metadata['dr:uuid']) ? $io->input->metadata['dr:uuid'] : NULL; + $node_uuid = isset($io->input->nuuid) ? 
$io->input->nuuid : NULL; + + $config = $this->getConfiguration(); + $timeout = $config['timeout']; // in seconds + $output = new \stdClass(); + + if (!empty($config['nlp_url']) && !empty($config['ml_method'])) { + $nlp = $this->getNLPClient(); + if ($nlp) { + $capabilities = $nlp->get_call('/status', NULL); + $languages_enabled = []; + $detected_lang = NULL; + //@TODO Should cache this too. Or deprecate ::language for 0.5.0 + if ($capabilities + && is_array($capabilities) + && is_array($capabilities['web64']['endpoints']) + && in_array($config['ml_method'], $capabilities['web64']['endpoints'])) { + if (in_array($config['source_type'], ['asstructure']) && isset($io->input->{$input_property}) && $file_uuid && $node_uuid) { + $mloutput = $this->runImageMLfromIIIF($io, $nlp); + $io->output = $mloutput ?? $output; + } + elseif (in_array($config['source_type'], ['ado', 'json']) && $node_uuid) { + $mloutput = $this->runTextMLfromJSON($io, $nlp); + $io->output = $mloutput ?? $output; + } + else { + throw new \Exception("Invalid argument(s) for ML processor"); + } + } + else { + throw new \Exception("Your NLP/ML endpoint does not provide ". $config['ml_method'] . ' capabilities'); + } + } + else { + throw new \Exception("NLP/ML endpoint did not respond"); + } + } + else { + throw new \Exception("Missing ML Configuration(s) for ML processor"); + } + } + + abstract protected function runImageMLfromIIIF($io, NlpClient $nlpClient): \stdClass; + + abstract protected function runTextMLfromJSON($io, NlpClient $nlpClient) :\stdClass; + + // Mime types supported as input to Tesseract. 
+ // See https://github.com/tesseract-ocr/tessdoc/blob/main/InputFormats.md + public function isImageMLMimeType($mime_type): bool { + $image_ML_mime_types = [ + 'image/png', + 'image/jpeg', + 'image/tiff', + 'image/jp2', + 'application/pdf', + ]; + return in_array($mime_type, $image_ML_mime_types); + } + + /** + * Returns the ML related settings of this processor's configuration. + * + * @return array + * An array with the 'nlp_url', 'ml_method' and 'iiif_server' values. + */ + public function getVectorMLInfo() { + $config = $this->getConfiguration(); + $info = [ + 'nlp_url' => $config['nlp_url'], + 'ml_method' => $config['ml_method'], + 'iiif_server' => $config['iiif_server'], + ]; + // Previously the array was built and then dropped; hand it to the caller. + return $info; + } + + abstract public function callImageML($image_url, $labels):mixed; + abstract public function callTextML($text, $query):mixed; + + protected function getNLPClient() { + if ($this->nlp_client) { + return $this->nlp_client; + } + else { + $config = $this->getConfiguration(); + $nlp = new NlpClient($config['nlp_url']); + $this->nlp_client = $nlp; + return $this->nlp_client; + } + } + + + +} diff --git a/src/Plugin/StrawberryRunnersPostProcessorPluginManager.php b/src/Plugin/StrawberryRunnersPostProcessorPluginManager.php index 606247e..a453c97 100644 --- a/src/Plugin/StrawberryRunnersPostProcessorPluginManager.php +++ b/src/Plugin/StrawberryRunnersPostProcessorPluginManager.php @@ -39,5 +39,4 @@ public function __construct( $this->setCacheBackend($cache_backend,'strawberry_runners_strawberryrunnerspostprocessor_plugins'); } - -} \ No newline at end of file +} diff --git a/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php b/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php new file mode 100644 index 0000000..32727e9 --- /dev/null +++ b/src/Plugin/views/argument/StrawberryRunnersMLImageArgument.php @@ -0,0 +1,542 @@ +setSbrEntityStorage( + $container->get('entity_type.manager')->getStorage('strawberry_runners_postprocessor') + ); + $plugin->setFileEntityStorage( + $container->get('entity_type.manager')->getStorage('file') + ); + $plugin->setFieldsHelper($container->get('search_api.fields_helper')); + $plugin->setViewStorage( + 
$container->get('entity_type.manager')->getStorage('view') + ); + $plugin->setViewStorage( + $container->get('entity_type.manager')->getStorage('view') + ); + $plugin->setCache($container->get('cache.default')); + $plugin->currentUser = $container->get('current_user'); + $plugin->strawberryRunnerUtilityService = $container->get( + 'strawberry_runner.utility' + ); + $plugin->strawberryRunnerProcessorPluginManager = $container->get( + 'strawberry_runner.processor_manager' + ); + return $plugin; + } + + + /** + * {@inheritdoc} + */ + public function defineOptions() { + $options = parent::defineOptions(); + $options['value']['default'] = []; + $options['sbf_fields'] = ['default' => NULL]; + $options['pre_query'] = ['default' => TRUE]; + $options['pre_query_facets'] = ['default' => TRUE]; + $options['topk'] = ['default' => 3]; + $options['ml_strawberry_postprocessor'] = ['default' => NULL]; + return $options; + } + + public function setSbrEntityStorage(EntityStorageInterface $sbrEntityStorage) + { + $this->sbrEntityStorage = $sbrEntityStorage; + return $this; + } + + public function setFileEntityStorage(EntityStorageInterface $fileEntityStorage) + { + $this->fileEntityStorage = $fileEntityStorage; + return $this; + } + + + protected function valueSubmit($form, FormStateInterface $form_state) { + $form_state = $form_state; + } + + public function setFieldsHelper(FieldsHelperInterface $fieldsHelper) { + $this->fieldsHelper = $fieldsHelper; + return $this; + } + + /** + * Sets the View Storage. + * + * @param \Drupal\Core\Entity\EntityStorageInterface $viewstorage + * The view Storage. + * + * @return $this + */ + public function setViewStorage(EntityStorageInterface $viewstorage) { + $this->viewStorage = $viewstorage; + return $this; + } + + /** + * Sets the Cache Backed. + * + * @param \Drupal\Core\Cache\CacheBackendInterface $cache + * The cache backend. Use to store complex calculations of property paths. 
+ * + * @return $this + */ + public function setCache(CacheBackendInterface $cache) { + $this->cache = $cache; + return $this; + } + + public function showOperatorForm(&$form, FormStateInterface $form_state) { + } + + /** + * {@inheritdoc} + */ + public function buildOptionsForm(&$form, FormStateInterface $form_state) { + parent::buildOptionsForm($form, $form_state); + $active_plugins = $this->strawberryRunnerUtilityService->getActivePluginConfigs(FALSE); + + foreach ($active_plugins as $by_source => $plugins) { + foreach ($plugins as $entity_id => $active_plugin) { + if (isset($active_plugin['ml_method'])) { + $post_processor_options[$entity_id] = $active_plugin['ml_method'] ."({$entity_id})"; + } + } + } + $fields = $this->getSbfDenseVectorFields() ?? []; + $form['sbf_fields'] = [ + '#type' => 'select', + '#title' => $this->t( + 'KNN Dense Vector Field to query against' + ), + '#description' => $this->t( + 'Select the fields that will be used to query against.' + ), + '#options' => $fields, + '#multiple' => FALSE, + '#default_value' => $this->options['sbf_fields'], + '#required' => TRUE, + ]; + $form['pre_query'] = [ + '#type' => 'checkbox', + '#default_value' => $this->options['pre_query'], + '#title' => $this->t('Treat previous filters to this as pre queries (Future Feature)'), + '#description'=> $this->t( + 'If any other filter setup before this one will be treated as pre-queries to the actual KNN query.' + ), + '#disabled' => TRUE, + ]; + $form['pre_query_facets'] = [ + '#type' => 'checkbox', + '#default_value' => $this->options['pre_query_facets'], + '#title' => $this->t('Treat also facets, if any, as pre queries (Future Feature)'), + '#description'=> $this->t( + 'If any other facets will be treated as pre-queries to the actual KNN query.' 
+ ), + '#disabled' => TRUE, + ]; + $form['topk'] = [ + '#type' => 'number', + '#default_value' => $this->options['topk'], + '#title' => $this->t('Top Similarity KNN hits to request to the backend.'), + '#description'=> $this->t( + 'The more, the slower' + ), + '#min' => 1, + '#max' => 100, + ]; + $form['ml_strawberry_postprocessor'] = [ + '#type' => 'select', + '#title' => $this->t( + 'Strawberry Runners processor to extract the on-the fly embedding' + ), + '#description' => $this->t( + 'Select the ML Strawberry Runners Processor that was used to index Vectors into the field you are going to search against. These need to match' + ), + '#options' => $post_processor_options, + '#multiple' => FALSE, + '#default_value' => $this->options['ml_strawberry_postprocessor'], + '#required' => TRUE, + ]; + } + /** + * Validate the options form. + */ + public function validateOptionsForm(&$form, FormStateInterface $form_state) { + // We need to validate that the selected field is of the same source/size as model that will + // be used to generate the on the fly vectors. + // So we need to load the SBR entity passed, compare the model against the constant present in + // \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessor\abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE + // and then load the field and see if the source (is of the same SBFlavor property/size (vector_576, etc) + $options = $form_state->getValue('options'); + $processor_id = $options['ml_strawberry_postprocessor'] ?? NULL; + $field_id = $options['sbf_fields']; + if ($processor_id) { + /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity|null */ + $plugin_config_entity = $this->sbrEntityStorage->load($processor_id); + if ($plugin_config_entity->isActive()) { + $sbr_config = $plugin_config_entity->getPluginconfig(); + // Note, we could also restrict to the same image mimetypes that the processor is setup to handle? 
+ if (isset($sbr_config['ml_method'])) { + $vector_size = abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$sbr_config['ml_method']] ?? ''; + $field_info = $this->getSbfDenseVectorFieldSource($field_id); + if ($field_info) { + // We do allow mixed data sources. One can be a node of course even if the source is a flavor. This is because each source could inherit properties from the other. + $propath_pieces = explode('/', $field_info->getCombinedPropertyPath()); + if (!(end($propath_pieces) == 'vector_' .$vector_size && $field_info->getType() == 'densevector_' . $vector_size)) { + $form_state->setErrorByName('options][ml_strawberry_postprocessor', $this->t('The Field/Processor combination is not right. Make sure your Configured KNN Dense Vector Field and the Strawberry Processor are targeting the same Vector Dimensions (e.g first one is from a vector_576 data source property and the field type is densevector_576 and the processor is calling YOLO)')); + } + } + else { + // The field is gone. + $form_state->setErrorByName('options][sbf_fields', $this->t('CConfigured KNN Dense Vector Field does not longer exists. Please replace your config with a valid/indexed field.')); + } + } + } + } + } + + public function submitOptionsForm(&$form, FormStateInterface $form_state) { + parent::submitOptionsForm( + $form, $form_state + ); + } + + + /** + * Set the input for this argument. + * + * @return TRUE if it successfully validates; FALSE if it does not. + */ + public function setArgument($arg) { + $this->argument = $arg; + return $this->validateArgument($arg); + } + + + public function query($group_by = FALSE) { + // if the User has not this permission simply return as nothing was sent. + if ($this->currentUser->isAnonymous() || (!$this->currentUser->hasPermission('execute Image ML queries') && !$this->currentUser->hasRole('administrator'))) { + return; + } + $this->argument_validated; + if (empty($this->expanded_argument) || ! 
$this->query) { + // basically not validated, not present as a value and also someone cancelled/nuklled the query before? + return; + } + // Just to be sure here bc we have our own way. Who knows if some external code decides to alter the value + $this->value = $this->expanded_argument; + // We should only be at this stage if we have validation + // As always, start by processing all inline, then move to separate code for cleaner methods + // We need to load the SBR entity first here + $iiif_image_url = null; + $processor_id = $this->options['ml_strawberry_postprocessor']; + /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity|null */ + $plugin_config_entity = $this->sbrEntityStorage->load($processor_id); + if ($plugin_config_entity->isActive()) { + $sbr_config = $plugin_config_entity->getPluginconfig(); + // Now we need to actually generate an instance of the runner using the config + $entity_id = $plugin_config_entity->id(); + $configuration_options = $plugin_config_entity->getPluginconfig(); + $configuration_options['configEntity'] = $entity_id; + /* @var \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginInterface $plugin_instance */ + $plugin_instance + = $this->strawberryRunnerProcessorPluginManager->createInstance( + $plugin_config_entity->getPluginid(), + $configuration_options + ); + if ($plugin_instance instanceof abstractMLPostProcessor) { + $iiifidentifier = urlencode( + StreamWrapperManager::getTarget($this->value->iiif_image_id) ?? NULL + ); + if ($iiifidentifier == NULL || empty($iiifidentifier)) { + return; + } + // basically the whole image if no bbox will be used as default + // Now prep the image for fetching. 
First pass, just an ID, then deal with the UUID for the file option + $region = 'full'; + if (isset($this->value->bbox->x)) { + $region = 'pct:'.($this->value->bbox->x).','.($this->value->bbox->y).','.($this->value->bbox->w).','.($this->value->bbox->h); + } + $iiif_image_url = $sbr_config['iiif_server']."/{$iiifidentifier}/{$region}/!640,640/0/default.jpg"; + try { + $response = $plugin_instance->callImageML($iiif_image_url, []); + } + catch (\Exception $exception) { + // Give user feedback + return; + } + if (isset($response['message'])) { + // Now here is an issue. Each endpoint will return the vector inside a yolo/etc. + // We should change that and make it generic (requires new pythong code/rebuilding NLP container) + // so for now i will use the ml method config split/last to get the right key. + foreach (["error","message","web64"] as $remove) { + unset($response[$remove]); + } + $all_knns = $this->query->getOption('sbf_knn') ?? []; + foreach ($response as $endpoint_key => $values) { + if (isset($values['vector']) && is_array($values['vector']) && count($values['vector']) == abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$sbr_config['ml_method']]) { + $all_knns[] = $this->buildKNNQuery($this->query, $values['vector']); + } + } + // array_filter() returns a new array, so keep its result; this drops empty KNN clauses before they are set on the query. + $all_knns = array_filter($all_knns); + if (count($all_knns)) { + $this->query->setOption('sbf_knn', $all_knns); + } + } + } + } + if (!$iiif_image_url) { + return; + } + return; + } + + public function validateArgument($arg) { + + $this->expanded_argument = NULL; + + // By using % in URLs, arguments could be validated twice; this eases + // that pain. 
+ if (isset($this->argument_validated)) { + return $this->argument_validated; + } + + if ($this->isException($arg)) { + return $this->argument_validated = TRUE; + } + + $plugin = $this->getPlugin('argument_validator'); + //return $this->argument_validated = $plugin->validateArgument($arg); + if ($arg && $this->is_base64(urldecode($arg))) { + // Because of actual implementation (JS to PHP) details changes are this will come from a JS encoded gzip that needs to be unpacked + // to try that first. On JS using pako with gzip is the ideal way. + // if unpacked it will be actuall an string encoded array (utf8, just numbers) + $arg = urldecode(base64_decode(urldecode($arg))); + $decoded = NULL; + $unpacked_deflated = explode(",", $arg); + if (count($unpacked_deflated) > 2) { + try { + $decoded = gzdecode(pack("c*",...$unpacked_deflated)); + } + catch (\Exception $e) { + // Ok was not that so we try another method + } + } + if (!$decoded) { + $decoded = gzuncompress($arg); + + + } + if ($decoded) { + $decoded_object = json_decode($decoded); + if ($decoded_object) { + if (!empty($decoded_object->fileuuid ?? NULL) && + !empty($decoded_object->nodeuuid ?? NULL) && + !empty($decoded_object->fragment ?? NULL)) { + $files = $this->fileEntityStorage->loadByProperties(['uuid' => $decoded_object->fileuuid]); + //@TODO for security. Check if the file is attached to the node too. 
+ $file = reset($files); + /* @var File $file */ + if ($file) { + $this->expanded_argument = new \stdClass; + $this->expanded_argument->iiif_image_id = $file->getFileUri(); + $fragment_pieces = explode("xywh=percent:",$decoded_object->fragment); + if (count($fragment_pieces) == 2) { + $xywh = explode(",", $fragment_pieces[1]); + if (count($xywh) == 4) { + // we got them all + $this->expanded_argument->bbox = (object) array_combine(['x','y','w','h'], $xywh); + $this->argument_validated = TRUE; + } + } + } + } + } + /* const image_data = { + "fileuuid": groupssetting.file_uuid, + "nodeuuid": groupssetting.nodeuuid, + "fragment": annotation.target.selector.value, + "textualbody": annotation.body?.value + } */ + } + } + return $this->argument_validated ?? FALSE; + } + + + /** + * Retrieves a list of all fields that contain in its path a Node Entity. + * + * @return string[] + * An options list of field identifiers mapped to their prefixed + * labels. + */ + protected function getSbfDenseVectorFields() { + $fields = []; + /** @var \Drupal\search_api\IndexInterface $index */ + $index = Index::load(substr($this->table, 17)); + + $fields_info = $index->getFields(); + foreach ($fields_info as $field_id => $field) { + //if (($field->getDatasourceId() == 'strawberryfield_flavor_datasource') && ($field->getType() == "integer")) { + // Anything except text, fulltext or any solr_text variations. Also skip direct node id and UUIDs which would + // basically return the same ADO as input filtered, given that those are unique. + $property_path = $field->getPropertyPath(); + $datasource_id = $field->getDatasourceId(); + if (str_starts_with($field->getType(), 'densevector_') === TRUE) { + $field->getDataDefinition(); + $fields[$field_id] = $field->getPrefixedLabel() . '('. 
$field->getFieldIdentifier() .')'; + } + } + return $fields; + } + + protected function getSbfDenseVectorFieldSource($field_id) { + $fields = []; + /** @var \Drupal\search_api\IndexInterface $index */ + $index = Index::load(substr($this->table, 17)); + $fields_info = $index->getField($field_id); + return $fields_info; + } + + protected function getExistingDenseVectorForImage($uri, $field) { + + } + + protected function is_base64($s){ + // Check if there are valid base64 characters + if (!preg_match('/^[a-zA-Z0-9\/\r\n+]*={0,2}$/', $s)) return false; + + // Decode the string in strict mode and check the results + $decoded = base64_decode($s, true); + if(false === $decoded) return false; + + // Encode the string again + if(base64_encode($decoded) != $s) return false; + + return true; + } + + /** + * Builds the Solr KNN query clause for the configured dense vector field. + * + * @param \Drupal\search_api\Plugin\views\query\SearchApiQuery $query + * @param array $vector + * The vector values that get imploded into the KNN clause. + * + * @return array|null + * The KNN clause(s) as strings, or NULL when the backend is not Solr. + * + * @throws \Drupal\search_api\SearchApiException + */ + protected function buildKNNQuery(SearchApiQuery $query, array $vector=[]):array|null { + // We can only use Solr kids. + $solr_query_string = []; + $backend = $query->getIndex()->getServerInstance()->getBackend(); + if (!($backend instanceof \Drupal\search_api_solr\SolrBackendInterface)) { + // FALSE is not allowed by the array|null return type and would raise a TypeError. + return NULL; + } + $allfields_translated_to_solr = $backend + ->getSolrFieldNames($query->getIndex()); + if (isset($allfields_translated_to_solr[$this->options['sbf_fields']])) { + $solr_query_string[] = "{!knn f={$allfields_translated_to_solr[$this->options['sbf_fields']]} topK={$this->options['topk']}}[" . implode(', ', $vector) . 
']'; + // {!knn f=vector topK=3}[-9.01364535e-03, -7.26634488e-02, -1.73818860e-02, ..., -1.16323479e-01] + } + return $solr_query_string; + } +} diff --git a/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php new file mode 100644 index 0000000..8296232 --- /dev/null +++ b/src/Plugin/views/filter/StrawberryRunnersMLImagefilter.php @@ -0,0 +1,715 @@ +setSbrEntityStorage( + $container->get('entity_type.manager')->getStorage('strawberry_runners_postprocessor') + ); + $plugin->setFieldsHelper($container->get('search_api.fields_helper')); + $plugin->setViewStorage( + $container->get('entity_type.manager')->getStorage('view') + ); + $plugin->setViewStorage( + $container->get('entity_type.manager')->getStorage('view') + ); + $plugin->setCache($container->get('cache.default')); + $plugin->currentUser = $container->get('current_user'); + $plugin->strawberryRunnerUtilityService = $container->get( + 'strawberry_runner.utility' + ); + $plugin->strawberryRunnerProcessorPluginManager = $container->get( + 'strawberry_runner.processor_manager' + ); + return $plugin; + } + + + /** + * {@inheritdoc} + */ + public function defineOptions() { + $options = parent::defineOptions(); + $options['value']['default'] = []; + $options['sbf_fields'] = ['default' => NULL]; + $options['pre_query'] = ['default' => TRUE]; + $options['pre_query_facets'] = ['default' => TRUE]; + $options['topk'] = ['default' => 3]; + $options['ml_strawberry_postprocessor'] = ['default' => NULL]; + return $options; + } + + public function setSbrEntityStorage(EntityStorageInterface $sbrEntityStorage) + { + $this->sbrEntityStorage = $sbrEntityStorage; + return $this; + } + + protected function canBuildGroup() { + return FALSE; + } + + /** + * {@inheritdoc} + */ + public function defaultExposeOptions() { + parent::defaultExposeOptions(); + $this->options['expose']['reduce'] = FALSE; + } + + protected function valueSubmit($form, FormStateInterface 
$form_state) { + $form_state = $form_state; + } + + protected function valueValidate($form, FormStateInterface $form_state) { + $form_state->setValue(['options', 'value'], []); + } + + + public function setFieldsHelper(FieldsHelperInterface $fieldsHelper) { + $this->fieldsHelper = $fieldsHelper; + return $this; + } + + /** + * Sets the View Storage. + * + * @param \Drupal\Core\Entity\EntityStorageInterface $viewstorage + * The view Storage. + * + * @return $this + */ + public function setViewStorage(EntityStorageInterface $viewstorage) { + $this->viewStorage = $viewstorage; + return $this; + } + + /** + * Sets the Cache Backed. + * + * @param \Drupal\Core\Cache\CacheBackendInterface $cache + * The cache backend. Use to store complex calculations of property paths. + * + * @return $this + */ + public function setCache(CacheBackendInterface $cache) { + $this->cache = $cache; + return $this; + } + + public function showOperatorForm(&$form, FormStateInterface $form_state) { + } + + /** + * {@inheritdoc} + */ + public function buildOptionsForm(&$form, FormStateInterface $form_state) { + parent::buildOptionsForm($form, $form_state); + $active_plugins = $this->strawberryRunnerUtilityService->getActivePluginConfigs(FALSE); + + foreach ($active_plugins as $by_source => $plugins) { + foreach ($plugins as $entity_id => $active_plugin) { + if (isset($active_plugin['ml_method'])) { + $post_processor_options[$entity_id] = $active_plugin['ml_method'] ."({$entity_id})"; + } + } + } + + $fields = $this->getSbfDenseVectorFields() ?? []; + $form['sbf_fields'] = [ + '#type' => 'select', + '#title' => $this->t( + 'KNN Dense Vector Field to query against' + ), + '#description' => $this->t( + 'Select the fields that will be used to query against.' 
+ ), + '#options' => $fields, + '#multiple' => FALSE, + '#default_value' => $this->options['sbf_fields'], + '#required' => TRUE, + ]; + $form['pre_query'] = [ + '#type' => 'checkbox', + '#default_value' => $this->options['pre_query'], + '#title' => $this->t('Treat previous filters to this as pre queries (Future Feature)'), + '#description'=> $this->t( + 'If any other filter setup before this one will be treated as pre-queries to the actual KNN query.' + ), + '#disabled' => TRUE, + ]; + $form['pre_query_facets'] = [ + '#type' => 'checkbox', + '#default_value' => $this->options['pre_query_facets'], + '#title' => $this->t('Treat also facets, if any, as pre queries (Future Feature)'), + '#description'=> $this->t( + 'If any other facets will be treated as pre-queries to the actual KNN query.' + ), + '#disabled' => TRUE, + ]; + $form['topk'] = [ + '#type' => 'number', + '#default_value' => $this->options['topk'], + '#title' => $this->t('Top Similarity KNN hits to request to the backend.'), + '#description'=> $this->t( + 'The more, the slower' + ), + '#min' => 1, + '#max' => 100, + ]; + $form['ml_strawberry_postprocessor'] = [ + '#type' => 'select', + '#title' => $this->t( + 'Strawberry Runners processor to extract the on-the fly embedding' + ), + '#description' => $this->t( + 'Select the ML Strawberry Runners Processor that was used to index Vectors into the field you are going to search against. These need to match' + ), + '#options' => $post_processor_options, + '#multiple' => FALSE, + '#default_value' => $this->options['ml_strawberry_postprocessor'], + '#required' => TRUE, + ]; + } + /** + * Validate the options form. + */ + public function validateOptionsForm(&$form, FormStateInterface $form_state) { + // We need to validate that the selected field is of the same source/size as model that will + // be used to generate the on the fly vectors. 
+ // So we need to load the SBR entity passed, compare the model against the constant present in + // \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessor\abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE + // and then load the field and see if the source (is of the same SBFlavor property/size (vector_576, etc) + $valid = FALSE; + $options = $form_state->getValue('options'); + $processor_id = $options['ml_strawberry_postprocessor'] ?? NULL; + $field_id = $options['sbf_fields']; + if ($processor_id) { + /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity|null */ + $plugin_config_entity = $this->sbrEntityStorage->load($processor_id); + if ($plugin_config_entity->isActive()) { + $sbr_config = $plugin_config_entity->getPluginconfig(); + // Note, we could also restrict to the same image mimetypes that the processor is setup to handle? + if (isset($sbr_config['ml_method'])) { + $vector_size = abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$sbr_config['ml_method']] ?? ''; + $field_info = $this->getSbfDenseVectorFieldSource($field_id); + if ($field_info) { + // We do allow mixed data sources. One can be a node of course even if the source is a flavor. This is because each source could inherit properties from the other. + $propath_pieces = explode('/', $field_info->getCombinedPropertyPath()); + if (!(end($propath_pieces) == 'vector_' .$vector_size && $field_info->getType() == 'densevector_' . $vector_size)) { + $form_state->setErrorByName('options][ml_strawberry_postprocessor', $this->t('The Field/Processor combination is not right. Make sure your Configured KNN Dense Vector Field and the Strawberry Processor are targeting the same Vector Dimensions (e.g first one is from a vector_576 data source property and the field type is densevector_576 and the processor is calling YOLO)')); + } + } + else { + // The field is gone. 
+ $form_state->setErrorByName('options][sbf_fields', $this->t('CConfigured KNN Dense Vector Field does not longer exists. Please replace your config with a valid/indexed field.')); + } + } + } + } + } + + public function submitOptionsForm(&$form, FormStateInterface $form_state) { + parent::submitOptionsForm( + $form, $form_state + ); + } + + public function isExposed() + { + return parent::isExposed() && ((!$this->currentUser->isAnonymous() && $this->currentUser->hasPermission('execute Image ML queries')) || $this->currentUser->hasRole('administrator')); + } + + protected function valueForm(&$form, FormStateInterface $form_state) { + // At this stage $this->value is not set? + $this->value = is_array($this->value) ? $this->value : (array) $this->value; + if (!$form_state->get('exposed')) { + $form['value'] = [ + '#type' => 'textarea', + '#title' => t('JSON used to query internal form'), + '#prefix' => '
', + '#suffix' => '
' + ]; + } + elseif ($this->isExposed()) { + $form['value'] = [ + '#type' => 'textarea', + '#title' => t('JSON used to query public form'), + '#prefix' => '
', + '#suffix' => '
', + '#access' => !$this->currentUser->isAnonymous() && $this->currentUser->hasPermission('execute Image ML queries') || $this->currentUser->hasRole('administrator'), + ] ; + } + } + + public function hasExtraOptions() { + return FALSE; + } + + /** + * @inheritDoc + */ + protected function operatorForm(&$form, FormStateInterface $form_state) { + parent::operatorForm($form, $form_state); // TODO: Change the autogenerated stub + } + + + /** + * {@inheritdoc} + */ + public function buildExposeForm(&$form, FormStateInterface $form_state) { + parent::buildExposeForm($form, $form_state); + unset($form['expose']['reduce']); + } + + + public function query() { + if ($this->currentUser->isAnonymous() || (!$this->currentUser->hasPermission('execute Image ML queries') && !$this->currentUser->hasRole('administrator'))) { + return; + } + + if (empty($this->value) || empty($this->validated_exposed_input) || !$this->getQuery()) { + // basically not validated, not present as a value and also someone cancelled/nuklled the query before? + return; + } + /* + * $this->value = {stdClass} + iiif_image_id = "s3://3b9%2Fimage-dcpl-p034-npsncr-00015-rexported-f2c69aeb-7bcb-434a-a781-e580cb3695b7.tiff" + bbox = {stdClass} + x = {float} 0.0 + y = {float} 0.0 + w = {float} 1.0 + h = {float} 1.0 + */ + // Just to be sure here bc we have our own way. 
Who knows if some external code decides to alter the value + $this->value = $this->validated_exposed_input; + // We should only be at this stage if we have validation + if (!is_array($this->value)) { + $this->value = (array) $this->value; + } + // As always, start by processing all inline, then move to separate code for cleaner methods + // We need to load the SBR entity first here + $iiif_image_url = null; + $processor_id = $this->options['ml_strawberry_postprocessor']; + /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity|null */ + $plugin_config_entity = $this->sbrEntityStorage->load($processor_id); + if ($plugin_config_entity->isActive()) { + $sbr_config = $plugin_config_entity->getPluginconfig(); + // Now we need to actually generate an instance of the runner using the config + $entity_id = $plugin_config_entity->id(); + $configuration_options = $plugin_config_entity->getPluginconfig(); + $configuration_options['configEntity'] = $entity_id; + /* @var \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginInterface $plugin_instance */ + $plugin_instance + = $this->strawberryRunnerProcessorPluginManager->createInstance( + $plugin_config_entity->getPluginid(), + $configuration_options + ); + if ($plugin_instance instanceof abstractMLPostProcessor) { + $iiifidentifier = urlencode( + StreamWrapperManager::getTarget( $this->validated_exposed_input->iiif_image_id) ?? NULL + ); + if ($iiifidentifier == NULL || empty($iiifidentifier)) { + return; + } + // basically the whole image if no bbox will be used as default + // Now prep the image for fetching. 
First pass, just an ID, then deal with the UUID for the file option + // pct:x,y,w,h + // !w,h + $region = 'full'; + if (isset($this->validated_exposed_input->bbox->x)) { + $region = 'pct:'.($this->validated_exposed_input->bbox->x * 100).','.($this->validated_exposed_input->bbox->y * 100).','.($this->validated_exposed_input->bbox->w * 100).','.($this->validated_exposed_input->bbox->h * 100); + } + $iiif_image_url = $sbr_config['iiif_server']."/{$iiifidentifier}/{$region}/!640,640/0/default.jpg"; + try { + $response = $plugin_instance->callImageML($iiif_image_url, []); + } + catch (\Exception $exception) { + // Give user feedback + return; + } + if (!empty($response['error'])) { + // we should log this + return; + } + elseif (isset($response['message'])) { + // Now here is an issue. Each endpoint will return the vector inside a yolo/etc. + // We should change that and make it generic (requires new pythong code/rebuilding NLP container) + // so for now i will use the ml method config split/last to get the right key. + foreach (["error","message","web64"] as $remove) { + unset($response[$remove]); + } + $all_knns = $this->getQuery()->getOption('sbf_knn') ?? []; + foreach ($response as $endpoint_key => $values) { + if (isset($values['vector']) && is_array($values['vector']) && count($values['vector']) == abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE[$sbr_config['ml_method']]) { + $all_knns[] = $this->buildKNNQuery($this->getQuery(), $values['vector']); + } + } + // array_filter() returns a new array, so keep its result; this drops empty KNN clauses before they are set on the query. + $all_knns = array_filter($all_knns); + if (count($all_knns)) { + $this->getQuery()->setOption('sbf_knn', $all_knns); + } + } + } + } + if (!$iiif_image_url) { + return; + } + return; + } + + + public function validate() { + + // For values passed by direct reference we will require/assume + // $json_for_url = base64_encode(gzcompress($json)); + // And this operation will happen on reading/setting back and forth. 
+ $errors = parent::validate(); + if (is_array($this->value)) { + if ($this->options['exposed'] && !$this->options['expose']['required'] + && empty($this->value) + ) { + // Don't validate if the field is exposed and no default value is provided. + return $errors; + } + // Choose different kind of output for 0, a single and multiple values. + if (count($this->value) == 0) { + $errors[] = $this->t( + 'No valid values found on filter: @filter.', + ['@filter' => $this->adminLabel(TRUE)] + ); + } + } + return $errors; + } + + public function validateExposed(&$form, FormStateInterface $form_state) { + // Only validate exposed input. + // In theory this is where i can alter the actual form state input + // to set a different URL argument? compress? + if (empty($this->options['exposed']) + || empty($this->options['expose']['identifier']) + ) { + return; + } + + $this->validated_exposed_input = NULL; + $identifier = $this->options['expose']['identifier']; + $input = $form_state->getValue($identifier); + if (is_string($input)) { + // trim() returns the trimmed copy; keep it so whitespace-only input is treated as empty. + $input = trim($input); + if (strlen($input) == 0) { + return; + } + } + $values = (array) $input; + if ($values) { + if ($this->isExposed()) { + // If already JSON + $json_input = StrawberryfieldJsonHelper::isValidJsonSchema($values[0], static::IMAGEML_INPUT_SCHEMA); + if ($json_input) { + // Probably not the place to compress the data for the URL? + $encoded = base64_encode(gzcompress($values[0])); + $form_state->setValue($identifier, $encoded); + $this->validated_exposed_input = $json_input; + } + elseif ($this->is_base64($values[0])) { + $decoded = gzuncompress(base64_decode($values[0])); + if ($decoded !== FALSE) { + // Validate the inflated JSON, not the still base64/gzip encoded payload. + $json_input = StrawberryfieldJsonHelper::isValidJsonSchema($decoded, static::IMAGEML_INPUT_SCHEMA); + if ($json_input !== FALSE) { + $this->validated_exposed_input = $json_input; + } + } + } + } + if (!$this->validated_exposed_input) { + // Check if the JSON is the right structure. 
+ $form_state->setErrorByName($identifier, $this->t("Wrong format for the ML Image filter input")); + } + else { + if ($this->validated_exposed_input->iiif_image_id && !(empty($this->validated_exposed_input->iiif_image_id))) { + $image_id = StreamWrapperManager::getTarget($this->validated_exposed_input->iiif_image_id); + // means passed without a streamwrapper + if (!$image_id) { + $form_state->setErrorByName($identifier, $this->t("Wrong format for the ML IIIF Image ID property. Make sure it contains a streamwrapper (e.g s3://)")); + } + } + } + } + else { + // Do for non exposed. Should be directly a JSON? + } + } + + + + public function acceptExposedInput($input) { + // Called during the form submit itself.. + $rc = parent::acceptExposedInput($input); + // a False means it won't be included/alter the generated query. + // This is useful! + if ($rc) { + // If we have previously validated input, override. + if (isset($this->validated_exposed_input)) { + $this->value = $this->validated_exposed_input; + } + else { + $this->value = NULL; + } + } + return $rc; + } + + /** + * @inheritDoc + */ + public function submitExposed(&$form, FormStateInterface $form_state) + { + parent::submitExposed($form, $form_state); // TODO: Change the autogenerated stub + } + + + /** + * Retrieves a list of all fields that contain in its path a Node Entity. + * + * @return string[] + * An options list of field identifiers mapped to their prefixed + * labels. + */ + protected function getSbfDenseVectorFields() { + $fields = []; + /** @var \Drupal\search_api\IndexInterface $index */ + $index = Index::load(substr($this->table, 17)); + + $fields_info = $index->getFields(); + foreach ($fields_info as $field_id => $field) { + //if (($field->getDatasourceId() == 'strawberryfield_flavor_datasource') && ($field->getType() == "integer")) { + // Anything except text, fulltext or any solr_text variations. 
Also skip direct node id and UUIDs which would + // basically return the same ADO as input filtered, given that those are unique. + $property_path = $field->getPropertyPath(); + $datasource_id = $field->getDatasourceId(); + if (str_starts_with($field->getType(), 'densevector_') === TRUE) { + $field->getDataDefinition(); + $fields[$field_id] = $field->getPrefixedLabel() . '('. $field->getFieldIdentifier() .')'; + } + } + return $fields; + } + + protected function getSbfDenseVectorFieldSource($field_id) { + $fields = []; + /** @var \Drupal\search_api\IndexInterface $index */ + $index = Index::load(substr($this->table, 17)); + $fields_info = $index->getField($field_id); + return $fields_info; + } + + protected function getExistingDenseVectorForImage($uri, $field) { + + } + + protected function is_base64($s){ + // Check if there are valid base64 characters + if (!preg_match('/^[a-zA-Z0-9\/\r\n+]*={0,2}$/', $s)) return false; + + // Decode the string in strict mode and check the results + $decoded = base64_decode($s, true); + if(false === $decoded) return false; + + // Encode the string again. Strict comparison avoids numeric-string juggling. + if(base64_encode($decoded) !== $s) return false; + + return true; + } + + /** + * Builds the Solr KNN query string for a vector against the configured dense vector field. + * + * @param \Drupal\search_api\Plugin\views\query\SearchApiQuery $query + * @param array $vector + * The embedding values, joined verbatim into the query. + * + * @return array|null + * An array holding zero or one Solr {!knn} query string, or NULL when the index backend is not Solr. + * + * @throws \Drupal\search_api\SearchApiException + */ + protected function buildKNNQuery(SearchApiQuery $query, array $vector=[]):array|null { + // We can only use Solr kids. + $solr_query_string = []; + $backend = $query->getIndex()->getServerInstance()->getBackend(); + if (!($backend instanceof \Drupal\search_api_solr\SolrBackendInterface)) { + // FALSE is not allowed by the array|null return type and would raise a TypeError. + return NULL; + } + $allfields_translated_to_solr = $backend + ->getSolrFieldNames($query->getIndex()); + if (isset($allfields_translated_to_solr[$this->options['sbf_fields']])) { + $solr_query_string[] = "{!knn f={$allfields_translated_to_solr[$this->options['sbf_fields']]} topK={$this->options['topk']}}[" . implode(', ', $vector) . 
']'; + // {!knn f=vector topK=3}[-9.01364535e-03, -7.26634488e-02, -1.73818860e-02, ..., -1.16323479e-01] + } + return $solr_query_string; + } +} diff --git a/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php b/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php new file mode 100644 index 0000000..b5a74bd --- /dev/null +++ b/src/Plugin/views/filter/StrawberryRunnersMLTextfilter.php @@ -0,0 +1,570 @@ +setSbrEntityStorage( + $container->get('entity_type.manager')->getStorage('strawberry_runners_postprocessor') + ); + $plugin->setFieldsHelper($container->get('search_api.fields_helper')); + $plugin->setViewStorage( + $container->get('entity_type.manager')->getStorage('view') + ); + $plugin->setViewStorage( + $container->get('entity_type.manager')->getStorage('view') + ); + $plugin->setCache($container->get('cache.default')); + $plugin->currentUser = $container->get('current_user'); + $plugin->strawberryRunnerUtilityService = $container->get( + 'strawberry_runner.utility' + ); + $plugin->strawberryRunnerProcessorPluginManager = $container->get( + 'strawberry_runner.processor_manager' + ); + return $plugin; + } + + + /** + * {@inheritdoc} + */ + public function defineOptions() { + $options = parent::defineOptions(); + $options['value']['default'] = []; + $options['sbf_fields'] = ['default' => NULL]; + $options['pre_query'] = ['default' => TRUE]; + $options['pre_query_facets'] = ['default' => TRUE]; + $options['topk'] = ['default' => 3]; + $options['ml_strawberry_postprocessor'] = ['default' => NULL]; + return $options; + } + + public function setSbrEntityStorage(EntityStorageInterface $sbrEntityStorage) + { + $this->sbrEntityStorage = $sbrEntityStorage; + return $this; + } + + protected function canBuildGroup() { + return FALSE; + } + + /** + * {@inheritdoc} + */ + public function defaultExposeOptions() { + parent::defaultExposeOptions(); + $this->options['expose']['reduce'] = FALSE; + } + + protected function valueSubmit($form, FormStateInterface 
$form_state) { + $form_state = $form_state; + } + + protected function valueValidate($form, FormStateInterface $form_state) { + $form_state->setValue(['options', 'value'], []); + } + + + public function setFieldsHelper(FieldsHelperInterface $fieldsHelper) { + $this->fieldsHelper = $fieldsHelper; + return $this; + } + + /** + * Sets the View Storage. + * + * @param \Drupal\Core\Entity\EntityStorageInterface $viewstorage + * The view Storage. + * + * @return $this + */ + public function setViewStorage(EntityStorageInterface $viewstorage) { + $this->viewStorage = $viewstorage; + return $this; + } + + /** + * Sets the Cache Backed. + * + * @param \Drupal\Core\Cache\CacheBackendInterface $cache + * The cache backend. Use to store complex calculations of property paths. + * + * @return $this + */ + public function setCache(CacheBackendInterface $cache) { + $this->cache = $cache; + return $this; + } + + public function showOperatorForm(&$form, FormStateInterface $form_state) { + } + + /** + * {@inheritdoc} + */ + public function buildOptionsForm(&$form, FormStateInterface $form_state) { + parent::buildOptionsForm($form, $form_state); + $active_plugins = $this->strawberryRunnerUtilityService->getActivePluginConfigs(FALSE); + + foreach ($active_plugins as $by_source => $plugins) { + foreach ($plugins as $entity_id => $active_plugin) { + if (isset($active_plugin['ml_method'])) { + if (in_array($active_plugin['ml_method'], array_keys(abstractMLPostProcessor::ML_TEXT_VECTOR_SIZE))){ + $post_processor_options[$entity_id] = $active_plugin['ml_method'] . "({$entity_id})"; + } + } + } + } + + $fields = $this->getSbfDenseVectorFields() ?? []; + $form['sbf_fields'] = [ + '#type' => 'select', + '#title' => $this->t( + 'KNN Dense Vector Field to query against' + ), + '#description' => $this->t( + 'Select the fields that will be used to query against.' 
+ ), + '#options' => $fields, + '#multiple' => FALSE, + '#default_value' => $this->options['sbf_fields'], + '#required' => TRUE, + ]; + $form['pre_query'] = [ + '#type' => 'checkbox', + '#default_value' => $this->options['pre_query'], + '#title' => $this->t('Treat previous filters to this as pre queries (Future Feature)'), + '#description'=> $this->t( + 'If any other filter setup before this one will be treated as pre-queries to the actual KNN query.' + ), + '#disabled' => TRUE, + ]; + $form['pre_query_facets'] = [ + '#type' => 'checkbox', + '#default_value' => $this->options['pre_query_facets'], + '#title' => $this->t('Treat also facets, if any, as pre queries (Future Feature)'), + '#description'=> $this->t( + 'If any other facets will be treated as pre-queries to the actual KNN query.' + ), + '#disabled' => TRUE, + ]; + $form['topk'] = [ + '#type' => 'number', + '#default_value' => $this->options['topk'], + '#title' => $this->t('Top Similarity KNN hits to request to the backend.'), + '#description'=> $this->t( + 'The more, the slower' + ), + '#min' => 1, + '#max' => 100, + ]; + $form['ml_strawberry_postprocessor'] = [ + '#type' => 'select', + '#title' => $this->t( + 'Strawberry Runners processor to extract the on-the fly embedding' + ), + '#description' => $this->t( + 'Select the ML Strawberry Runners Processor that was used to index Vectors into the field you are going to search against. These need to match' + ), + // The options variable is only assigned when at least one active text ML processor exists. + '#options' => $post_processor_options ?? [], + '#multiple' => FALSE, + '#default_value' => $this->options['ml_strawberry_postprocessor'], + '#required' => TRUE, + ]; + } + /** + * Validate the options form. + */ + public function validateOptionsForm(&$form, FormStateInterface $form_state) { + // We need to validate that the selected field is of the same source/size as model that will + // be used to generate the on the fly vectors. 
+ // So we need to load the SBR entity passed, compare the model against the constant present in + // \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessor\abstractMLPostProcessor::ML_IMAGE_VECTOR_SIZE + // and then load the field and see if the source (is of the same SBFlavor property/size (vector_576, etc) + $valid = FALSE; + $options = $form_state->getValue('options'); + $processor_id = $options['ml_strawberry_postprocessor'] ?? NULL; + $field_id = $options['sbf_fields']; + if ($processor_id) { + /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity|null */ + $plugin_config_entity = $this->sbrEntityStorage->load($processor_id); + // load() gives NULL when the processor entity no longer exists; nullsafe call avoids a fatal error. + if ($plugin_config_entity?->isActive()) { + $sbr_config = $plugin_config_entity->getPluginconfig(); + // Note, we could also restrict to the same image mimetypes that the processor is setup to handle? + if (isset($sbr_config['ml_method'])) { + $vector_size = abstractMLPostProcessor::ML_TEXT_VECTOR_SIZE[$sbr_config['ml_method']] ?? ''; + $field_info = $this->getSbfDenseVectorFieldSource($field_id); + if ($field_info) { + // We do allow mixed data sources. One can be a node of course even if the source is a flavor. This is because each source could inherit properties from the other. + $propath_pieces = explode('/', $field_info->getCombinedPropertyPath()); + if (!(end($propath_pieces) == 'vector_' .$vector_size && $field_info->getType() == 'densevector_' . $vector_size)) { + $form_state->setErrorByName('options][ml_strawberry_postprocessor', $this->t('The Field/Processor combination is not right. Make sure your Configured KNN Dense Vector Field and the Strawberry Processor are targeting the same Vector Dimensions (e.g first one is from a vector_576 data source property and the field type is densevector_576 and the processor is calling YOLO)')); + } + } + else { + // The field is gone. 
+ $form_state->setErrorByName('options][sbf_fields', $this->t('CConfigured KNN Dense Vector Field does not longer exists. Please replace your config with a valid/indexed field.')); + } + } + } + } + } + + public function submitOptionsForm(&$form, FormStateInterface $form_state) { + parent::submitOptionsForm( + $form, $form_state + ); + } + + /** + * @inheritDoc + */ + public function isExposed() + { + return parent::isExposed() && ((!$this->currentUser->isAnonymous() && $this->currentUser->hasPermission('execute Text ML queries')) || $this->currentUser->hasRole('administrator')); + } + + + protected function valueForm(&$form, FormStateInterface $form_state) { + // At this stage $this->value is not set? + $this->value = is_array($this->value) ? $this->value : (array) $this->value; + if (!$form_state->get('exposed')) { + $form['value'] = [ + '#type' => 'textarea', + '#title' => t('Text query to be Vectorized'), + '#prefix' => '
', + '#suffix' => '
' + ]; + } + elseif ($this->isExposed() ) { + $form['value'] = [ + '#type' => 'textarea', + '#title' => t('Text query to be vectorized'), + '#prefix' => '
', + '#suffix' => '
', + '#access' => !$this->currentUser->isAnonymous() && $this->currentUser->hasPermission('execute Text ML queries') || $this->currentUser->hasRole('administrator'), + ] ; + } + } + + public function hasExtraOptions() { + return FALSE; + } + + /** + * @inheritDoc + */ + protected function operatorForm(&$form, FormStateInterface $form_state) { + parent::operatorForm($form, $form_state); // TODO: Change the autogenerated stub + } + + + /** + * {@inheritdoc} + */ + public function buildExposeForm(&$form, FormStateInterface $form_state) { + parent::buildExposeForm($form, $form_state); + unset($form['expose']['reduce']); + } + + + public function query() { + if (empty($this->value) || empty($this->validated_exposed_input) || !$this->getQuery() || + ($this->currentUser->isAnonymous() || (!$this->currentUser->hasPermission('execute Text ML queries') && !$this->currentUser->hasRole('administrator'))) + ) { + // basically not validated, not present as a value or not the right permisisons. + return; + } + + // Just to be sure here bc we have our own way. 
Who knows if some external code decides to alter the value + $this->value = $this->validated_exposed_input; + // We should only be at this stage if we have validation + if (is_array($this->value) && !empty($this->value)) { + $this->value = array_values($this->value); + $this->value = reset($this->value); + } + if (is_string($this->value)) { + $this->value = trim($this->value); + } + if (empty($this->value)) { + return; + } + + + $processor_id = $this->options['ml_strawberry_postprocessor']; + /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity|null */ + $plugin_config_entity = $this->sbrEntityStorage->load($processor_id); + // load() gives NULL when the configured processor entity was removed; skip the KNN query then. + if ($plugin_config_entity?->isActive()) { + $sbr_config = $plugin_config_entity->getPluginconfig(); + // Now we need to actually generate an instance of the runner using the config + $entity_id = $plugin_config_entity->id(); + $configuration_options = $plugin_config_entity->getPluginconfig(); + $configuration_options['configEntity'] = $entity_id; + /* @var \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginInterface $plugin_instance */ + $plugin_instance + = $this->strawberryRunnerProcessorPluginManager->createInstance( + $plugin_config_entity->getPluginid(), + $configuration_options + ); + if ($plugin_instance instanceof abstractMLPostProcessor) { + try { + $response = $plugin_instance->callTextML($this->value, TRUE); + } + catch (\Exception $exception) { + // Give user feedback + return; + } + if (!empty($response['error'])) { + // we should log this + return; + } + elseif (isset($response['message'])) { + // Now here is an issue. Each endpoint will return the vector inside a yolo/etc. + // We should change that and make it generic (requires new Python code/rebuilding NLP container) + // so for now i will use the ml method config split/last to get the right key. 
+ foreach (["error","message","web64"] as $remove) { + unset($response[$remove]); + } + $all_knns = $this->getQuery()->getOption('sbf_knn') ?? []; + foreach ($response as $endpoint_key => $values) { + if (isset($values['vector']) && is_array($values['vector']) && count($values['vector']) == abstractMLPostProcessor::ML_TEXT_VECTOR_SIZE[$sbr_config['ml_method']]) { + $all_knns[] = $this->buildKNNQuery($this->getQuery(), $values['vector']); + } + } + // array_filter() returns a new array; assign it so empty/failed KNN builds are really dropped. + $all_knns = array_values(array_filter($all_knns)); + if (count($all_knns)) { + $this->getQuery()->setOption('sbf_knn', $all_knns); + } + } + } + } + return; + } + + + public function validate() { + + // For values passed by direct reference we will require/assume + // $json_for_url = base64_encode(gzcompress($json)); + // And this operation will happen on reading/setting back and forth. + $errors = parent::validate(); + if (is_array($this->value)) { + if ($this->options['exposed'] && !$this->options['expose']['required'] + && empty($this->value) + ) { + // Don't validate if the field is exposed and no default value is provided. + return $errors; + } + // Choose different kind of output for 0, a single and multiple values. + if (count($this->value) == 0) { + $errors[] = $this->t( + 'No valid values found on filter: @filter.', + ['@filter' => $this->adminLabel(TRUE)] + ); + } + } + return $errors; + } + + public function validateExposed(&$form, FormStateInterface $form_state) { + // Only validate exposed input. + // In theory this is where i can alter the actual form state input + // to set a different URL argument? compress? 
+ if (empty($this->options['exposed']) + || empty($this->options['expose']['identifier']) + ) { + return; + } + + $this->validated_exposed_input = NULL; + $identifier = $this->options['expose']['identifier']; + $input = $form_state->getValue($identifier); + if (is_string($input)) { + // trim() returns the trimmed copy; keep it so whitespace-only input is treated as empty. + $input = trim($input); + if (strlen($input) == 0) { + return; + } + } + $values = $input; + if ($values) { + $this->validated_exposed_input = $values; + } + } + + + + public function acceptExposedInput($input) { + // Called during the form submit itself.. + $rc = parent::acceptExposedInput($input); + // a False means it won't be included/alter the generated query. + // This is useful! + if ($rc) { + // If we have previously validated input, override. + if (isset($this->validated_exposed_input)) { + $this->value = $this->validated_exposed_input; + } + else { + $this->value = NULL; + } + } + return $rc; + } + + /** + * @inheritDoc + */ + public function submitExposed(&$form, FormStateInterface $form_state) + { + parent::submitExposed($form, $form_state); // TODO: Change the autogenerated stub + } + + + /** + * Retrieves a list of all fields that contain in its path a Node Entity. + * + * @return string[] + * An options list of field identifiers mapped to their prefixed + * labels. + */ + protected function getSbfDenseVectorFields() { + $fields = []; + /** @var \Drupal\search_api\IndexInterface $index */ + $index = Index::load(substr($this->table, 17)); + + $fields_info = $index->getFields(); + foreach ($fields_info as $field_id => $field) { + //if (($field->getDatasourceId() == 'strawberryfield_flavor_datasource') && ($field->getType() == "integer")) { + // Anything except text, fulltext or any solr_text variations. Also skip direct node id and UUIDs which would + // basically return the same ADO as input filtered, given that those are unique. 
+ $property_path = $field->getPropertyPath(); + $datasource_id = $field->getDatasourceId(); + if (str_starts_with($field->getType(), 'densevector_') === TRUE) { + $field->getDataDefinition(); + $fields[$field_id] = $field->getPrefixedLabel() . '('. $field->getFieldIdentifier() .')'; + } + } + return $fields; + } + + protected function getSbfDenseVectorFieldSource($field_id) { + $fields = []; + /** @var \Drupal\search_api\IndexInterface $index */ + $index = Index::load(substr($this->table, 17)); + $fields_info = $index->getField($field_id); + return $fields_info; + } + + /** + * Builds the Solr KNN query string for a vector against the configured dense vector field. + * + * @param \Drupal\search_api\Plugin\views\query\SearchApiQuery $query + * @param array $vector + * The embedding values, joined verbatim into the query. + * + * @return array|null + * An array holding zero or one Solr {!knn} query string, or NULL when the index backend is not Solr. + * + * @throws \Drupal\search_api\SearchApiException + */ + protected function buildKNNQuery(SearchApiQuery $query, array $vector=[]):array|null { + // We can only use Solr kids. + $solr_query_string = []; + $backend = $query->getIndex()->getServerInstance()->getBackend(); + if (!($backend instanceof \Drupal\search_api_solr\SolrBackendInterface)) { + // FALSE is not allowed by the array|null return type and would raise a TypeError. + return NULL; + } + $allfields_translated_to_solr = $backend + ->getSolrFieldNames($query->getIndex()); + if (isset($allfields_translated_to_solr[$this->options['sbf_fields']])) { + $solr_query_string[] = "{!knn f={$allfields_translated_to_solr[$this->options['sbf_fields']]} topK={$this->options['topk']}}[" . implode(', ', $vector) . 
']'; + // {!knn f=vector topK=3}[-9.01364535e-03, -7.26634488e-02, -1.73818860e-02, ..., -1.16323479e-01] + } + return $solr_query_string; + } +} diff --git a/src/strawberryRunnerUtilityService.php b/src/strawberryRunnerUtilityService.php index ba76358..7cebdec 100644 --- a/src/strawberryRunnerUtilityService.php +++ b/src/strawberryRunnerUtilityService.php @@ -2,6 +2,7 @@ namespace Drupal\strawberry_runners; +use Drupal\Component\Plugin\Exception\PluginException; use Drupal\Core\Config\ConfigFactoryInterface; use Drupal\Core\Queue\QueueFactory; use Drupal\Core\Entity\ContentEntityInterface; @@ -158,7 +159,10 @@ public function invokeProcessorForAdo(ContentEntityInterface $entity, array $sbf "weight" => "0" "configEntity" => "test" ]*/ + // @TODO document how an ADO processor config would look like. $askeymap = []; + $jmespathsmap = []; + //Plugins that run on Files attached to an ADO if (isset($active_plugins['entity:file'])) { foreach ($active_plugins['entity:file'] as $activePluginId => $config) { // Only add to $askeymap if $filter is empty or $activePluginId is in the $filter. @@ -172,6 +176,21 @@ public function invokeProcessorForAdo(ContentEntityInterface $entity, array $sbf } } } + //Plugins that run on Metadata of an ADO + if (isset($active_plugins['entity:node'])) { + foreach ($active_plugins['entity:node'] as $activePluginId => $config) { + if (empty($filter) || in_array($activePluginId, $filter)) { + if ($config['source_type'] == 'ado') { + $jmespaths = array_filter($config['jmespath'] ?? []); + // This assumes we always need a jmespath. Review the idea of "all" without a jmespath + // which is really an *. But for now we prefer the "verbosity" of having one. 
+ foreach ($jmespaths as $key => $value) { + $jmespathsmap[$key][$activePluginId] = $config; + } + } + } + } + } foreach ($sbf_fields as $field_name) { /* @var $field \Drupal\Core\Field\FieldItemInterface */ @@ -188,6 +207,7 @@ public function invokeProcessorForAdo(ContentEntityInterface $entity, array $sbf if (isset($flatvalues['type'])) { $sbf_type = (array) $flatvalues['type']; } + // File level plugins $plugin_definition['input_type'] == entity:file foreach ($askeymap as $jsonkey => $activePlugins) { if (isset($flatvalues[$jsonkey])) { foreach ($flatvalues[$jsonkey] as $uniqueid => $asstructure) { @@ -298,19 +318,91 @@ public function invokeProcessorForAdo(ContentEntityInterface $entity, array $sbf } } } + // JSON/Metadata level plugins coming from ADO JSON directly , $plugin_definition['input_type'] == entity:node + foreach ($jmespathsmap as $jmespath => $activePlugins) { + // evaluate the jmespath + $metadata_from_json = NULL; + if (is_string($jmespath)) { + $metadata_from_json = $itemfield->searchPath($jmespath); + } + if (!empty($metadata_from_json)) { + foreach ($activePlugins as $activePluginId => $config) { + // Checks if the flag is set and is an array. + $nopost = (isset($flatvalues["ap:tasks"]["ap:nopost"]) && + is_array($flatvalues["ap:tasks"]["ap:nopost"])); + + if ($nopost) { + if (in_array($activePluginId, $flatvalues["ap:tasks"]["ap:nopost"])) { + // if we have an entry like ["ap:tasks"]["ap:nopost"][0] == "pager" we don't run pager + // for this ADO. We won't delete existing ones. Just never process. + continue; + } + } + // @TODO how to avoid running on metadata generated by a processor? + // We should limit where that metadata goes. Should never be the same ADO? 
+ $valid_ado_type = explode(',', $config['ado_type']); + $valid_ado_type = array_map('trim', $valid_ado_type); + if (empty($config['ado_type']) + || count( + array_intersect($valid_ado_type, $sbf_type) + ) > 0 + ) { + + $data = new \stdClass(); + $data->fid = NULL; + $data->nid = $entity->id(); + $data->nuuid = $entity->uuid(); + $data->field_name = $field_name; + $data->field_delta = $delta; + // There is no cleanup needed here, so we avoid even triggering the option + // by saying. This was cleaned up before. + $data->sbr_cleanedup_before = TRUE; + // Get the configured Language from descriptive metadata + if (isset($config['language_key']) + && !empty($config['language_key']) + && isset($flatvalues[$config['language_key']]) + ) { + $data->lang = is_array( + $flatvalues[$config['language_key']] + ) ? array_values($flatvalues[$config['language_key']]) + : [$flatvalues[$config['language_key']]]; + } + else { + $data->lang = $config['language_default'] ?? NULL; + } + // Check if there is a key that forces processing. + $force_from_metadata_or_arg + = isset($flatvalues["ap:tasks"]["ap:forcepost"]) + ? (bool) $flatvalues["ap:tasks"]["ap:forcepost"] + : $force; + // In this case metadata is the actual metadata as filtered by a JMESPATH. + // We double nest, so we can add a sibling key "checksum" + // NOTE: means when a processor needs to access metadata it will have to digg inside a "json" + // key. Same if it generates data for a nested processor. 
+ $data->metadata = ["json" => $metadata_from_json, "checksum" => md5($metadata_from_json)]; + $data->force = $force_from_metadata_or_arg; + $data->plugin_config_entity_id = $activePluginId; + $this->queueFactory->get( + 'strawberryrunners_process_index', TRUE + )->createItem($data); + } + } + } + } } } } } /** - * Gets all Currently Active PLugin Entities and Configs initialized - * + * Gets all Currently Active Plugin Entities and Configs initialized * + * @param bool $onlyRoot + * TRUE means we only get Top/first call Processors. FALSE, any processor at any level. * @return array - * @throws \Drupal\Component\Plugin\Exception\PluginException + * @throws PluginException */ - public function getActivePluginConfigs():array { + public function getActivePluginConfigs($onlyRoot = TRUE):array { $active_plugins = []; /* @var $plugin_config_entities \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity[] */ $plugin_config_entities = $this->entityTypeManager->getListBuilder( @@ -320,7 +412,7 @@ public function getActivePluginConfigs():array { foreach ($plugin_config_entities as $plugin_config_entity) { // Only get first level (no Parents) and Active ones. 
if ($plugin_config_entity->isActive() - && $plugin_config_entity->getParent() == '' + && (($onlyRoot && $plugin_config_entity->getParent() == '') || (!$onlyRoot)) ) { $entity_id = $plugin_config_entity->id(); $configuration_options = $plugin_config_entity->getPluginconfig(); diff --git a/src/strawberryRunnerUtilityServiceInterface.php b/src/strawberryRunnerUtilityServiceInterface.php index 8981c43..ca90e20 100644 --- a/src/strawberryRunnerUtilityServiceInterface.php +++ b/src/strawberryRunnerUtilityServiceInterface.php @@ -25,4 +25,14 @@ public function invokeProcessorForAdo(ContentEntityInterface $entity, array $sbf_fields, bool $force = FALSE, array $filter = [] ): void; + /** + * Gets all Currently Active PLugin Entities and Configs initialized + * + * @param bool $onlyRoot + * TRUE means we only get Top/first call Processors. FALSE, any processor at any level. + * @return array + * @throws \Drupal\Component\Plugin\Exception\PluginException + */ + public function getActivePluginConfigs($onlyRoot = TRUE):array; + } diff --git a/strawberry_runners.module b/strawberry_runners.module index ff7bedb..f4e2ef5 100644 --- a/strawberry_runners.module +++ b/strawberry_runners.module @@ -4,9 +4,63 @@ * Contains strawberryfield.module. */ -use Drupal\Core\Form\FormStateInterface; -use Drupal\Core\Url; -use Drupal\webform\WebformSubmissionForm; -use Drupal\Core\Entity\ContentEntityInterface; -use Drupal\file\Entity\File; -use Drupal\node\NodeInterface; +use Drupal\Core\Utility\Error as ErrorAlias; +use Drupal\search_api\Entity\Index; + + + +/** + * Implements hook_views_data_alter(). + */ +function strawberry_runners_views_data_alter(array &$data) { + //@see search_api_views_data() + /** @var \Drupal\search_api\IndexInterface $index */ + foreach (Index::loadMultiple() as $index) { + try { + $key = 'search_api_index_' . 
$index->id(); + $table = &$data[$key]; + + $ml_image_filter = _search_api_views_find_field_alias('sbr_imageml_filter', $table); + $ml_text_filter = _search_api_views_find_field_alias('sbr_textml_filter', $table); + $table[$ml_image_filter] = [ + 'title' => t('Image Similarity Filter via KNN (Experimental)'), + 'group' => t('Search'), + 'help' => t('Filters one or more Images belonging to an ADO against the Corresponding Vector in a Strawberry Flavor Document generating on the Fly an Embedding Vector.'), + 'filter' => [ + 'title' => t('Image Similarity Filter via KNN '), + 'field' => 'id', + 'id' => 'sbr_imageml_filter', + ], + 'argument' => [ + 'title' => t('Image Similarity Filter via KNN '), + 'field' => 'id', + 'id' => 'sbr_imageml_filter', + 'disable_break_phrase' => TRUE, // Disallows multiple values for ML fields + ], + ]; + if ($ml_image_filter != 'sbr_imageml_filter') { + $table[$ml_image_filter]['real field'] = 'sbr_imageml_filter'; + } + $table[$ml_text_filter] = [ + 'title' => t('Text Similarity Filter via KNN (Experimental)'), + 'group' => t('Search'), + 'help' => t('Filters one Query Phrases to the Corresponding Vector in a Strawberry Flavor Document generating on the Fly an Embedding Vector.'), + 'filter' => [ + 'title' => t('Text Similarity Filter via KNN '), + 'field' => 'id', + 'id' => 'sbr_textml_filter', + ], + ]; + if ($ml_text_filter != 'sbr_textml_filter') { + // The aliased text filter entry needs the real field, not the image filter entry. + $table[$ml_text_filter]['real field'] = 'sbr_textml_filter'; + } + } + catch (\Exception $e) { + $args = [ + '%index' => $index->label(), + ]; + // Error::logException() expects a logger instance as first argument, not a channel name. + ErrorAlias::logException(\Drupal::logger('strawberry_runners'), $e, '%type while computing Views data for index %index: @message in %function (line %line of %file).', $args); + } + } + return $data; +} diff --git a/strawberry_runners.permissions.yml b/strawberry_runners.permissions.yml new file mode 100644 index 0000000..d53663d --- /dev/null +++ b/strawberry_runners.permissions.yml @@ -0,0 +1,6 @@ +'execute Image ML queries': + title: 'Execute Image ML queries 
(KNN). This permission is enforced by the ML Image Views Argument Plugin and Filter Plugin' + description: 'Only users with this permission will have Image based ML Views queries executed when using ML Image Views Argument(s). For security/performance reasons, this permission has no effect on Anonymous Users.' +'execute Text ML queries': + title: 'Execute Text ML queries (KNN). This permission is enforced by the ML Text Views Filter Plugin' + description: 'Only users with this permission will have Text based ML Views queries executed when using the ML Text Views Filter. For security/performance reasons, this permission has no effect on Anonymous Users.'