diff --git a/library/Application/Console/App.php b/library/Application/Console/App.php index 895f3101a..e3da9f7a6 100644 --- a/library/Application/Console/App.php +++ b/library/Application/Console/App.php @@ -40,6 +40,8 @@ /** * Command line application for OPUS 4 management tasks. + * + * TODO get list of Commands from configuration/registration (allow modules to add commands, decentralize the code) */ class Application_Console_App extends Application { @@ -53,7 +55,11 @@ public function __construct() $this->add(new ExtractFileCommand()); // $this->add(new Application_Console_Index_RepairCommand()); // $this->add(new Application_Console_Index_CheckCommand()); + $this->add(new Application_Console_Document_DeleteCommand()); + $this->add(new Application_Console_Document_DuplicatesCommand()); + $this->add(new Application_Console_Document_DiffCommand()); + $this->add(new BibtexImportCommand()); $this->add(new BibtexListCommand()); $this->add(new Application_Console_Debug_DocumentXmlCommand()); diff --git a/library/Application/Console/Document/DiffCommand.php b/library/Application/Console/Document/DiffCommand.php new file mode 100644 index 000000000..5d2233388 --- /dev/null +++ b/library/Application/Console/Document/DiffCommand.php @@ -0,0 +1,209 @@ +--doi Option, the database is +searched for matching documents. If more than one is found, the differences +between the documents are reported. + +NOTES: +- Complex values like persons and patents are not shown with all their metadata. +- For text values, like titles and abstracts, the exact differences are not + highlighted. The report just shows that the values are different. 
+ +EOT; + + $this->setName('document:diff') + ->setDescription('Shows differences between documents.') + ->setHelp($help) + ->addOption( + self::OPTION_DOI, + null, + InputOption::VALUE_REQUIRED, + 'DOI value' + ) + ->addOption( + self::OPTION_SERVER_STATE, + 's', + InputOption::VALUE_REQUIRED, + 'Include docs in state (DOI) - unpublished, published, inprogress, audited, restricted, deleted' + ) + ->addOption( + self::OPTION_IGNORE_DELETED, + null, + InputOption::VALUE_NONE, + 'Ignore deleted documents (DOI)' + ) + ->addArgument( + self::ARGUMENT_DOC_ID, + InputArgument::OPTIONAL | InputArgument::IS_ARRAY, + 'Two or more document IDs' + ); + } + + /** + * Compares the selected documents and prints their differences. + * + * If fewer than two documents were selected only a notice is printed. + * + * @return int Always 0 + * @throws NotFoundException + */ + protected function execute(InputInterface $input, OutputInterface $output) + { + $diff = new Application_Document_DocumentDiff(); + $diff->setOutput($output); + + $docIds = $this->getDocumentsInput($input, $output); + + $docCount = count($docIds); + + if ($docCount === 0) { + $output->writeln('No documents for comparison'); + } elseif ($docCount === 1) { + $docId = $docIds[0]; + $output->writeln("Only one document for comparison (ID = $docId)"); + } else { + $diff->diff($docIds); + } + + return 0; + } + + /** + * Determines the IDs of the documents to compare. + * + * Document IDs passed as arguments take precedence. Without arguments the DOI option is used to search for + * matching documents, optionally restricted to the selected server states. + * + * @param InputInterface $input + * @param OutputInterface $output + * @return int[] + * @throws ConfigException + */ + protected function getDocumentsInput($input, $output) + { + $docIds = $input->getArgument(self::ARGUMENT_DOC_ID); + + if (count($docIds) === 0) { + $doi = $input->getOption(self::OPTION_DOI); + + if ($doi !== null) { + $finder = Repository::getInstance()->getDocumentFinder(); + $finder->setIdentifierValue('doi', $doi); + + $serverStates = $this->getServerStateInput($input, $output); + + // a state filter is only needed if not every valid state has been selected + if ($serverStates !== null && count($serverStates) < count($this->getValidServerStates())) { + $finder->setServerState($serverStates); + } + + $docIds = $finder->getIds(); + + if (count($docIds) === 0) { + $output->writeln("No documents found for DOI: $doi"); + } + } + } + + return $docIds; + }
+ + /** + * Determines the server states used to filter the documents found for a DOI. + * + * Without the server state option all valid states are returned (minus 'deleted' if the ignore-deleted option + * is set). Otherwise the comma separated option value is parsed: entries are matched case-insensitively, + * surrounding whitespace is ignored and invalid entries are reported and skipped. + * + * @param InputInterface $input + * @param OutputInterface $output + * @return string[]|null + */ + protected function getServerStateInput($input, $output) + { + $validStates = $this->getValidServerStates(); + + if ($input->getOption(self::OPTION_IGNORE_DELETED)) { + // remove the state by value, so it does not matter how the array of valid states is keyed + $validStates = array_diff($validStates, ['deleted']); + } + + $serverStateOption = $input->getOption(self::OPTION_SERVER_STATE); + + $serverStates = []; + + if ($serverStateOption !== null) { + $values = explode(',', $serverStateOption); + foreach ($values as $value) { + // normalize the entry, so ' Published' is accepted and stored as 'published' + $state = strtolower(trim($value)); + if (in_array($state, $validStates, true)) { + $serverStates[$state] = $state; + } else { + $output->writeln("Invalid ServerState value: $value"); + } + } + } else { + $serverStates = $validStates; + } + + return $serverStates; + } + + /** + * @return string[] + * + * TODO better way of getting valid states (configurable states in the future?) + */ + protected function getValidServerStates() + { + $doc = Document::new(); + $field = $doc->getField('ServerState'); + return $field->getDefault(); + } +} diff --git a/library/Application/Console/Document/DuplicatesCommand.php b/library/Application/Console/Document/DuplicatesCommand.php new file mode 100644 index 000000000..ed6f64caa --- /dev/null +++ b/library/Application/Console/Document/DuplicatesCommand.php @@ -0,0 +1,238 @@ +--csv-report option is used to provide a file name, a CSV formatted +report is written, containing links to the documents found. The links depend on the 'url' +option being set in the configuration ('config.ini'). The columns in the CSV file are: + + DOI, Doc-ID, Link, Date Created, Server State + +Duplicate documents can be removed automatically using the --remove option.
+ +NOT SUPPORTED YET: +- Tagging and linking of duplicate documents for review by administrator + +EOT; + + $this->setName('document:duplicates') + ->setDescription('Removes duplicate documents by checking DOIs.') + ->setHelp($help) + ->addOption( + self::OPTION_DOI, + null, + InputOption::VALUE_REQUIRED, + 'One or more DOI values (CSV)' + ) + ->addOption( + self::OPTION_DOI_FILE, + null, + InputOption::VALUE_REQUIRED, + 'File containing DOIs (one per line)' + ) + ->addOption( + self::OPTION_DRYRUN, + null, + InputOption::VALUE_NONE, + 'Check DOIs without making changes' + ) + ->addOption( + self::OPTION_CSV_REPORT, + null, + InputOption::VALUE_REQUIRED, + 'Output file for CSV report' + ) + ->addOption( + self::OPTION_REMOVE, + null, + InputOption::VALUE_NONE, + 'Automatically remove newest duplicate document if UNPUBLISHED' + ); + } + + /** + * Checks DOI values for duplicate documents, optionally writing a CSV report. + * + * The DOI values are read from the input. If none are provided the database is searched for duplicate DOIs. + * Dry-run and remove mode are passed on to the duplicate finder. + * + * @return int 0 on success, 1 if the CSV report file cannot be opened + * @throws NotFoundException + */ + protected function execute(InputInterface $input, OutputInterface $output) + { + $finder = new Application_Document_DuplicateFinder(); + $finder->setOutput($output); + + if ($input->getOption(self::OPTION_DRYRUN)) { + $finder->setDryRunEnabled(true); + } + + if ($input->getOption(self::OPTION_REMOVE)) { + $finder->setRemoveEnabled(true); + } + + $doiValues = $this->getDoiInput($input, $output); + + if (count($doiValues) === 0) { + $output->writeln('Searching for duplicate DOI values in database ...'); + $doiValues = $this->getAllDuplicateDoiValues(); + } + + // Processing + + $doiCount = count($doiValues); + + if ($doiCount > 0) { + $csvFile = null; + $csvPath = $input->getOption(self::OPTION_CSV_REPORT); + if ($csvPath !== null) { + $csvFile = fopen($csvPath, 'w'); + if ($csvFile === false) { + // without this check the invalid handle would be passed on to the finder + $output->writeln("Unable to open CSV report file: $csvPath"); + return 1; + } + $finder->setCsvFile($csvFile); + } + + $output->writeln("Checking {$doiCount} DOI values"); + + $progressBar = null; + + if ($output->getVerbosity() === $output::VERBOSITY_NORMAL) { + $progressBar = new ProgressBar($output, $doiCount); + $finder->setProgressBar($progressBar); + }
+ + try { + $finder->removeDuplicateDocuments($doiValues); + } finally { + // always close the report file, even if processing throws + if ($csvFile !== null) { + fclose($csvFile); + } + } + + if ($progressBar !== null) { + $progressBar->finish(); + $output->writeln(''); + } + } else { + $output->writeln('No DOI values found'); + } + + return 0; + } + + /** + * Reads DOI values from the DOI option, the DOI file option or the input stream (in that order). + * + * Values are trimmed; empty lines and duplicate values are dropped. + * + * @param InputInterface $input + * @param OutputInterface $output + * @return string[] Unique DOI values, empty if no input was provided + * @throws \RuntimeException If the DOI file cannot be read + */ + protected function getDoiInput($input, $output) + { + $doi = $input->getOption(self::OPTION_DOI); + + if ($doi !== null) { + $doiInput = str_replace(',', "\r\n", $doi); + } else { + $doiFile = $input->getOption(self::OPTION_DOI_FILE); + + if ($doiFile !== null) { + $doiInput = file_get_contents($doiFile); + + if ($doiInput === false) { + // an empty result would make execute() fall back to processing all duplicate DOIs in the database + throw new \RuntimeException("Unable to read DOI file: $doiFile"); + } + } else { + $inputStream = null; + + if ($input instanceof StreamableInputInterface) { + $inputStream = $input->getStream(); + } + + $inputStream = $inputStream ?? STDIN; + + // non-blocking mode has to be set on the stream that is actually read + stream_set_blocking($inputStream, false); + + $doiInput = (string) stream_get_contents($inputStream); + } + } + + if (strlen(trim($doiInput)) === 0) { + return []; + } + + $doiValues = []; + + foreach (preg_split("/((\r?\n)|(\r\n?))/", $doiInput) as $line) { + $value = trim($line); + if ($value !== '') { + // skip empty lines, e.g. caused by a trailing line break + $doiValues[] = $value; + } + } + + $uniqueDoiValues = array_values(array_unique($doiValues)); + + if (count($doiValues) !== count($uniqueDoiValues)) { + $output->writeln('Duplicates entries removed from DOI list.'); + } + + return $uniqueDoiValues; + } + + /** + * @return string[] + */ + protected function getAllDuplicateDoiValues() + { + $doiManager = DoiManager::getInstance(); + + return $doiManager->getDuplicateDoiValues(); + } +} diff --git a/library/Application/Document/DocumentDiff.php b/library/Application/Document/DocumentDiff.php new file mode 100644 index 000000000..e017bc369 --- /dev/null +++ b/library/Application/Document/DocumentDiff.php @@ -0,0 +1,241 @@ +toArray(); + $keys = array_unique(array_merge($keys, array_keys($docData))); + $metadata[$docId] = $docData; + } + + $maxDocId = max(array_keys($metadata)); + $maxDigits = strlen((string) $maxDocId); + + sort($keys); + + $differences = []; + + foreach ($keys as $key) { + $values = []; + + foreach ($metadata as $docId =>
$docData) { + if (isset($docData[$key])) { + $value = $docData[$key]; + } else { + $value = null; + } + + $values[$docId] = $value; + } + + $previousValue = null; + $firstValue = true; + + foreach ($values as $value) { + if ($firstValue) { + $previousValue = $value; + $firstValue = false; + } else { + if ($previousValue !== $value) { + $differences[$key] = $values; + } + } + } + } + + $diffKeys = array_keys($differences); + + $output = $this->getOutput(); + + foreach ($diffKeys as $key) { + $output->writeln("Field: $key"); + $values = $differences[$key]; + $field = $doc->getField($key); + $valueModelClass = $field->getValueModelClass(); + $linkModelClass = $field->getLinkModelClass(); + + foreach ($values as $docId => $value) { + $output->writeln('------------------------------------------------------------'); + if (! is_array($value)) { + $line = sprintf(" %{$maxDigits}d: %s", $docId, $value); + $output->writeln($line); + } else { + if (count($value) > 0) { + if ($valueModelClass !== null) { + if ($field->hasMultipleValues()) { + foreach ($value as $subValue) { + $this->renderModel($key, $valueModelClass, $linkModelClass, $subValue, $docId, $maxDigits); + } + } else { + $this->renderModel($key, $valueModelClass, $linkModelClass, $value, $docId, $maxDigits); + } + } + } else { + $line = sprintf(" %{$maxDigits}d: %s", $docId, '-'); + $output->writeln($line); + } + } + } + $output->writeln('------------------------------------------------------------'); + $output->writeln(''); + } + } + + /** + * @param string $key + * @param string|null $valueModelClass + * @param string|null $linkModelClass + * @param mixed $value + * @param int $docId + * @param int $maxDigits + */ + protected function renderModel($key, $valueModelClass, $linkModelClass, $value, $docId, $maxDigits) + { + if (strpos($key, 'Person') === 0) { + unset($value['DateOfBirth']); // TODO date field causes problems with fromArray + } + + if ($key === 'File') { + unset($value['ServerDateSubmitted']); // TODO 
date field causes problems with fromArray + } + + if ($linkModelClass !== null) { + $model = $linkModelClass::fromArray($value); + } else { + $model = $valueModelClass::fromArray($value); + } + + switch ($key) { + case 'Collection': + $displayName = $model->getDisplayName(); + break; + + case 'EmbargoDate': + case 'CompletedDate': + case 'PublishedDate': + case 'ServerDatePublished': + case 'ServerDateCreated': + case 'ServerDateModified': + case 'ThesisDateAccepted': + $displayName = (string) $model; + break; + + case 'Enrichment': + $displayName = '' . $model->getKeyName() . ': ' . $model->getValue(); + break; + + case 'Identifier': + $displayName = '' . $model->getType() . ': ' . $model->getValue(); + break; + + case 'TitleMain': + case 'TitleAbstract': + case 'TitleAdditional': + case 'TitleParent': + case 'TitleSub': + $displayName = '(' . $model->getLanguage() . ') ' . $model->getValue(); + break; + + case 'Note': + $displayName = $model->getMessage(); + break; + + case 'Patent': + $displayName = $model->getNumber() . ' - ' . $model->getApplication(); + break; + + case 'Subject': + $displayName = $model->getValue() . ' (' . $model->getType() . ')'; + break; + + case 'Person': + $displayName = $model->getDisplayName() . ' (' . $model->getRole() . ')'; + break; + + case 'File': + $displayName = $model->getLabel() . ' (' . $model->getMimeType() . 
')'; + break; + + default: + $displayName = $model->getDisplayName(); + } + + $line = sprintf(" %{$maxDigits}d: %s", $docId, $displayName); + $this->getOutput()->writeln($line); + } + + /** + * @param OutputInterface $output + * @return $this + */ + public function setOutput($output) + { + $this->output = $output; + return $this; + } + + /** + * @return OutputInterface + */ + public function getOutput() + { + if ($this->output === null) { + $this->output = new ConsoleOutput(); + } + return $this->output; + } +} diff --git a/library/Application/Document/DuplicateFinder.php b/library/Application/Document/DuplicateFinder.php new file mode 100644 index 000000000..fac639b60 --- /dev/null +++ b/library/Application/Document/DuplicateFinder.php @@ -0,0 +1,370 @@ + need to generate a report (CSV) + */ +class Application_Document_DuplicateFinder +{ + /** @var OutputInterface */ + private $output; + + /** @var ProgressBar */ + private $progressBar; + + /** @var resource */ + private $csvFile; + + /** @var bool */ + private $dryRun; + + /** @var string */ + private $fromDate; + + /** @var string */ + private $untilDate; + + /** @var bool */ + private $remove; + + /** + * @param string[] $listOfDoi + */ + public function removeDuplicateDocuments($listOfDoi) + { + $progressBar = $this->getProgressBar(); + foreach ($listOfDoi as $doi) { + $this->removeDuplicateDocument($doi); + if ($progressBar) { + $progressBar->advance(); + } + } + } + + /** + * @param string $doi + */ + public function removeDuplicateDocument($doi) + { + $output = $this->getOutput(); + + $output->write("Checking $doi ... ", false, OutputInterface::VERBOSITY_VERBOSE); + + $docIds = $this->findDocuments($doi); + + $docCount = count($docIds); + + $output->write("found {$docCount} document(s)", false, OutputInterface::VERBOSITY_VERBOSE); + + if (count($docIds) > 1) { + if ($output->isVerbose()) { + $output->write(' - ' . implode(', ', $docIds) . 
' '); + } + + foreach ($docIds as $docId) { + $doc = Document::get($docId); + $this->writeCsv($doi, $doc); + } + + // TODO log if more than 2 documents were found + $doc = $this->getNewestDocument($docIds); + $docId = $doc->getId(); + $serverState = $doc->getServerState(); + + if ($doc->getServerState() === Document::STATE_UNPUBLISHED) { + if ($output->isVerbose()) { + $output->write("- REMOVE document {$docId}"); + } + $this->performAction($doc); + } else { + $output->write( + "- KEEP document {$docId} in state '{$serverState}'", + false, + OutputInterface::VERBOSITY_VERBOSE + ); + } + } else { + $this->writeCsv($doi); + } + + $output->writeln('', OutputInterface::VERBOSITY_VERBOSE); + } + + /** + * @param string $doi + * @return int[] + */ + public function findDocuments($doi) + { + $finder = Repository::getInstance()->getDocumentFinder(); + + $finder->setIdentifierValue('doi', $doi); + + return $finder->getIds(); + } + + /** + * @param int[] $docIds + * @return DocumentInterface + */ + public function getNewestDocument($docIds) + { + $doc = null; + + foreach ($docIds as $docId) { + $nextDoc = Document::get($docId); + if ($doc === null) { + $doc = $nextDoc; + } else { + switch ($doc->getServerDateCreated()->compare($nextDoc->getServerDateCreated())) { + case -1: + $doc = $nextDoc; + break; + + case 0: + // if ServerDateCreated is the same keep the document with the higher database ID + if ($doc->getId() < $nextDoc->getId()) { + $doc = $nextDoc; + } + break; + + default: + break; + } + } + } + + return $doc; + } + + /** + * @param string $doi + * @param DocumentInterface|null $doc + */ + protected function writeCsv($doi, $doc = null) + { + $csvFile = $this->getCsvFile(); + + if ($csvFile !== null) { + if ($doc !== null) { + $baseLink = $this->getBaseLink(); + $docId = $doc->getId(); + $dateCreated = $doc->getServerDateCreated()->getDateTime(); + + $data = [ + $doi, + $docId, + "<$baseLink/$docId>", + $dateCreated->format('Y-m-d'), + $doc->getServerState(), + ]; + 
} else { + $data = [ + $doi, + 'NOT_FOUND', + '', + '', + '', + ]; + } + + fputcsv($csvFile, $data); + } + } + + /** + * @return string|null + */ + protected function getBaseLink() + { + $config = Config::get(); + + if (isset($config->url)) { + $url = $config->url ?? ''; + return rtrim($url, "/ \n\r\t\v\x00"); + } + + return null; + } + + /** + * @param DocumentInterface $doc + * + * TODO move action into separate classes for different actions (report, mark, delete, ...) + */ + protected function performAction($doc) + { + if (! $this->isDryRunEnabled() && $this->isRemoveEnabled()) { + $doc->delete(); + } + } + + /** + * @param bool $enabled + */ + public function setDryRunEnabled($enabled) + { + $this->dryRun = $enabled; + } + + /** + * @return bool + */ + public function isDryRunEnabled() + { + return $this->dryRun; + } + + /** + * @param OutputInterface $output + */ + public function setOutput($output) + { + $this->output = $output; + } + + /** + * @return OutputInterface + */ + public function getOutput() + { + if ($this->output === null) { + $this->output = new ConsoleOutput(); + } + return $this->output; + } + + /** + * @param string $fromDate + * @return $this + */ + public function setFromDate($fromDate) + { + $this->fromDate = $fromDate; + + return $this; + } + + /** + * @return string + */ + public function getFromDate() + { + return $this->fromDate; + } + + /** + * @param string $untilDate + * @return $this + */ + public function setUntilDate($untilDate) + { + $this->untilDate = $untilDate; + + return $this; + } + + /** + * @return string + */ + public function getUntilDate() + { + return $this->untilDate; + } + + /** + * @param bool $enabled + * @return $this + */ + public function setRemoveEnabled($enabled) + { + $this->remove = $enabled; + + return $this; + } + + /** + * @return bool + */ + public function isRemoveEnabled() + { + return $this->remove; + } + + /** + * @param resource $csvFile + * @return $this + */ + public function setCsvFile($csvFile) 
+ { + $this->csvFile = $csvFile; + return $this; + } + + /** + * @return resource|null + */ + public function getCsvFile() + { + return $this->csvFile; + } + + /** + * @param ProgressBar|null $progressBar + * @return $this + */ + public function setProgressBar($progressBar) + { + $this->progressBar = $progressBar; + return $this; + } + + /** + * @return ProgressBar|null + */ + public function getProgressBar() + { + return $this->progressBar; + } +} diff --git a/tests/library/Application/Console/Document/DiffCommandTest.php b/tests/library/Application/Console/Document/DiffCommandTest.php new file mode 100644 index 000000000..dc4d0ceb5 --- /dev/null +++ b/tests/library/Application/Console/Document/DiffCommandTest.php @@ -0,0 +1,41 @@ +markTestIncomplete('TODO testing'); + } +} diff --git a/tests/library/Application/Console/Document/DuplicatesCommandTest.php b/tests/library/Application/Console/Document/DuplicatesCommandTest.php new file mode 100644 index 000000000..c795505dd --- /dev/null +++ b/tests/library/Application/Console/Document/DuplicatesCommandTest.php @@ -0,0 +1,41 @@ +markTestIncomplete('TODO testing'); + } +} diff --git a/tests/library/Application/Document/DocumentDiffTest.php b/tests/library/Application/Document/DocumentDiffTest.php new file mode 100644 index 000000000..85fe204fd --- /dev/null +++ b/tests/library/Application/Document/DocumentDiffTest.php @@ -0,0 +1,38 @@ +markTestIncomplete('TODO testing'); + } +} diff --git a/tests/library/Application/Document/DuplicateFinderTest.php b/tests/library/Application/Document/DuplicateFinderTest.php new file mode 100644 index 000000000..606cca9c4 --- /dev/null +++ b/tests/library/Application/Document/DuplicateFinderTest.php @@ -0,0 +1,162 @@ +helper = new Application_Document_DuplicateFinder(); + $this->helper->setOutput(new NullOutput()); + $this->helper->setRemoveEnabled(true); + + $this->setupTestDocuments(); + } + + public function testFindDocuments() + { + $helper = $this->helper; + + $doi = 
'10.1000/182'; + + $docIds = $helper->findDocuments($doi); + + $this->assertCount(2, $docIds); + } + + public function testFindDocumentsUnknownDoi() + { + $helper = $this->helper; + + $doi = '10.1000/282'; + + $docIds = $helper->findDocuments($doi); + + $this->assertCount(0, $docIds); + } + + public function testRemoveNewerDuplicateDocument() + { + $this->helper->removeDuplicateDocument('10.1000/182'); + + Document::get($this->docId3); + + $this->expectException(NotFoundException::class); + + Document::get($this->docId2); + } + + public function testDoNotRemoveNewerDuplicateDocumentIfDryRunEnabled() + { + $helper = $this->helper; + + $helper->setDryRunEnabled(true); + $helper->removeDuplicateDocument('10.1000/182'); + + Document::get($this->docId2); + Document::get($this->docId1); + } + + public function testRemoveOnlyUnpublishedDocuments() + { + $doc = Document::get($this->docId2); + $doc->setServerState(Document::STATE_INPROGRESS); + $doc->store(); + + $this->helper->removeDuplicateDocument('10.1000/182'); + + Document::get($this->docId2); + Document::get($this->docId1); + Document::get($this->docId3); + } + + public function testGetNewestDocument() + { + $docIds = [$this->docId1, $this->docId2]; + + $doc = $this->helper->getNewestDocument($docIds); + + $this->assertNotNull($doc); + $this->assertEquals($this->docId2, $doc->getId()); + + $docIds = [$this->docId2, $this->docId1]; + + $doc = $this->helper->getNewestDocument($docIds); + + $this->assertNotNull($doc); + $this->assertEquals($this->docId2, $doc->getId()); + } + + protected function setupTestDocuments() + { + $doc = $this->createTestDocument(); + $doi = $doc->addIdentifierDoi(); + $doi->setValue('10.1000/182'); + $this->docId1 = $doc->store(); + + sleep(1); + + $doc = $this->createTestDocument(); + $doi = $doc->addIdentifierDoi(); + $doi->setValue('10.1000/182'); + $this->docId2 = $doc->store(); + + $doc = $this->createTestDocument(); + $doi = $doc->addIdentifierDoi(); + $doi->setValue('10.1000/183'); + 
$this->docId3 = $doc->store(); + } +}