Skip to content

Commit

Permalink
Add pipeline forging
Browse files Browse the repository at this point in the history
  • Loading branch information
kleag committed May 7, 2024
1 parent a654b71 commit fbbcd04
Show file tree
Hide file tree
Showing 14 changed files with 377 additions and 155 deletions.
80 changes: 75 additions & 5 deletions lima_common/conf/lima-analysis.xml
Original file line number Diff line number Diff line change
Expand Up @@ -511,11 +511,81 @@ SPDX-License-Identifier: MIT
<entry key="fre" value="xml"/>
<entry key="por" value="xml"/>
</map>
<!-- The none pipeline is an empty one allowing to not do anything. For tests mainly. -->
<map name="none">
<entry key="eng" value="none"/>
<entry key="fre" value="none"/>
<entry key="por" value="none"/>
<!-- The empty pipeline contains no process unit and therefore does nothing. It is mainly intended for tests. -->
<map name="empty">
<entry key="eng" value="empty"/>
<entry key="fre" value="empty"/>
<entry key="por" value="empty"/>

<entry key="ud" value="empty"/>

<entry key="ud-afr" value="empty" />
<entry key="ud-ara" value="empty" />
<entry key="ud-bel" value="empty" />
<entry key="ud-bre" value="empty" />
<entry key="ud-bul" value="empty" />
<entry key="ud-cat" value="empty" />
<entry key="ud-ces" value="empty" />
<entry key="ud-chu" value="empty" />
<entry key="ud-cop" value="empty" />
<entry key="ud-cym" value="empty" />
<entry key="ud-dan" value="empty" />
<entry key="ud-deu" value="empty" />
<entry key="ud-ell" value="empty" />
<entry key="ud-eng" value="empty" />
<entry key="ud-est" value="empty" />
<entry key="ud-eus" value="empty" />
<entry key="ud-fas" value="empty" />
<entry key="ud-fin" value="empty" />
<entry key="ud-fra" value="empty" />
<entry key="ud-fro" value="empty" />
<entry key="ud-gla" value="empty" />
<entry key="ud-gle" value="empty" />
<entry key="ud-glg" value="empty" />
<entry key="ud-heb" value="empty" />
<entry key="ud-hin" value="empty" />
<entry key="ud-hrv" value="empty" />
<entry key="ud-hun" value="empty" />
<entry key="ud-hye" value="empty" />
<entry key="ud-ind" value="empty" />
<entry key="ud-ita" value="empty" />
<entry key="ud-jpn" value="empty" />
<entry key="ud-kaz" value="empty" />
<entry key="ud-kmr" value="empty" />
<entry key="ud-kor" value="empty" />
<entry key="ud-lat" value="empty" />
<entry key="ud-lav" value="empty" />
<entry key="ud-lit" value="empty" />
<entry key="ud-lzh" value="empty" />
<entry key="ud-mar" value="empty" />
<entry key="ud-mlt" value="empty" />
<entry key="ud-nld" value="empty" />
<entry key="ud-nno" value="empty" />
<entry key="ud-nob" value="empty" />
<entry key="ud-orv" value="empty" />
<entry key="ud-pcm" value="empty" />
<entry key="ud-pol" value="empty" />
<entry key="ud-por" value="empty" />
<entry key="ud-ron" value="empty" />
<entry key="ud-rus" value="empty" />
<entry key="ud-slk" value="empty" />
<entry key="ud-slv" value="empty" />
<entry key="ud-sme" value="empty" />
<entry key="ud-spa" value="empty" />
<entry key="ud-sqi" value="empty" />
<entry key="ud-srp" value="empty" />
<entry key="ud-swe" value="empty" />
<entry key="ud-tam" value="empty" />
<entry key="ud-tel" value="empty" />
<entry key="ud-tha" value="empty" />
<entry key="ud-tur" value="empty" />
<entry key="ud-uig" value="empty" />
<entry key="ud-ukr" value="empty" />
<entry key="ud-urd" value="empty" />
<entry key="ud-vie" value="empty" />
<entry key="ud-wol" value="empty" />
<entry key="ud-zho-simp" value="empty" />
<entry key="ud-zho" value="empty" />
</map>
</group>
<group name="analysisDumpers">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ class ProcessUnitPipeline : public ProcessUnit
const std::string& unitId,
typename ProcessUnit::Manager* manager);

/**
 * Appends the given process unit at the end of this pipeline's sequence of
 * process units.
 *
 * @param pu shared pointer to the process unit to append
 */
void push_back(std::shared_ptr<ProcessUnit> pu);

private:
void debugPrintInactiveUnits() const;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ void ProcessUnitPipeline<ProcessUnit>::push_back(
m_processUnitSequence.push_back(pu);
}

/**
 * Appends an already built process unit at the end of the pipeline's
 * process unit sequence.
 *
 * @param pu shared pointer to the process unit to append; it is stored as is
 *           (no null check is performed here)
 */
template <typename ProcessUnit>
void ProcessUnitPipeline<ProcessUnit>::push_back(std::shared_ptr<ProcessUnit> pu)
{
m_processUnitSequence.push_back(pu);
}

template <typename ProcessUnit>
LimaStatusCode ProcessUnitPipeline<ProcessUnit>::process(
AnalysisContent& analysis) const
Expand Down
2 changes: 1 addition & 1 deletion lima_linguisticprocessing/conf/lima-lp-ud-eng.xml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ SPDX-License-Identifier: MIT
</list>
</group>

<group name="none" class="ProcessUnitPipeline">
<group name="empty" class="ProcessUnitPipeline">
<list name="processUnitSequence"/>
</group>
<!-- ******************************************
Expand Down
2 changes: 1 addition & 1 deletion lima_linguisticprocessing/conf/lima-lp-ud.xml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ SPDX-License-Identifier: MIT
</list>
</group>

<group name="none" class="ProcessUnitPipeline">
<group name="empty" class="ProcessUnitPipeline">
<list name="processUnitSequence"/>
</group>
<!-- ******************************************
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,80 @@ add_subdirectory(xmlreader)

add_definitions(-DLIMA_LINGUISTICPROCESSIONGCLIENT_EXPORTING)

########### next target ###############

SET(lima-lp-analyzer_LIB_SRCS
lima.cpp
)

add_library(lima-lp-analyzer SHARED ${lima-lp-analyzer_LIB_SRCS})
ADD_DLL_TO_PATH(lima-lp-analyzer)


# Libraries linked into lima-lp-analyzer. Each library is listed exactly once
# (lima-linguisticprocessing used to appear twice in this list).
target_link_libraries(lima-lp-analyzer
  lima-linguisticprocessing
  lima-common-datahandler
  lima-common-data
  lima-common-factory
  lima-common-fsaaccess
  lima-common-mediaprocessors
  lima-common-mediaticdata
  lima-common-misc
  lima-common-processunitframework
  lima-common
  lima-common-time
  lima-common-tools
  lima-common-xmlconfigurationfiles
  lima-lp-analysisdict
  lima-lp-analysisdumpers
  lima-lp-analysishandlers
  lima-lp-applyrecognizer
  lima-lp-automatoncompiler
  lima-lp-automaton
  lima-lp-bagofwords
  lima-lp-client
  lima-lp-compounds
  lima-lp-conllureader
  lima-lp-corefsolver
  lima-lp-dictionary
  lima-lp-entitytracker
  lima-lp-eventanalyzer
  lima-lp-flattokenizer
  lima-lp-helpers
  lima-lp-langdetector
  lima-lp-lineartextrepresentation
  lima-lp-linguisticanalysisstructure
  lima-lp-linguisticdata
  lima-lp-linguisticprocessing-core
  lima-lp-linguisticprocessors
  lima-lp-linguisticresources
  lima-lp-misc
  lima-lp-morphologicanalysis
  lima-lp-postagger
  lima-lp-propertycode
  lima-lp-regexmatcher
  lima-lp-se-datetime
  lima-lp-semanticanalysis
  lima-lp-se-number
  lima-lp-se-person
  lima-lp-specificentities
  lima-lp-syntacticanalysis
  lima-lp-textsegmentation
  lima-lp-tgv
  lima-lp-wordsensetagger
  lima-xp-client
  lima-xp-core
  lima-xp-documentsreader
  lima-xp-qtsgmlentities
)

set_target_properties(lima-lp-analyzer PROPERTIES VERSION ${LIMA_LP_LIB_VERSION} SOVERSION ${LIMA_LP_LIB_SOVERSION})

install(TARGETS lima-lp-analyzer DESTINATION ${LIB_INSTALL_DIR}
COMPONENT runtime)


########### next target ###############

SET(lima-lp-client_LIB_SRCS
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,15 @@
#include "common/LimaVersion.h"
#include "common/Data/strwstrtools.h"
#include "common/MediaticData/mediaticData.h"
#include "common/MediaProcessors/MediaProcessors.h"
#include "common/MediaProcessors/MediaProcessUnit.h"
#include <common/ProcessUnitFramework/AnalysisContent.h>
#include "common/QsLog/QsLog.h"
#include "common/QsLog/QsLogDest.h"
#include "common/QsLog/QsLogCategories.h"
#include "common/QsLog/QsDebugOutput.h"
#include "common/XMLConfigurationFiles/xmlConfigurationFileParser.h"
#include "common/XMLConfigurationFiles/groupConfigurationStructure.h"
#include "common/time/traceUtils.h"
#include "common/tools/FileUtils.h"
#include "common/tools/LimaMainTaskRunner.h"
Expand Down Expand Up @@ -117,6 +119,7 @@ using LangData = Lima::Common::MediaticData::LanguageData;
using MedData = Lima::Common::MediaticData::MediaticData ;
using namespace Lima::Common::Misc;
using namespace Lima::Common::PropertyCode;
using namespace Lima::Common::XMLConfigurationFiles;
using namespace Lima;

struct character_escaper
Expand Down Expand Up @@ -157,6 +160,10 @@ class LimaAnalyzerPrivate

void initMetaData();

/**
 * Creates a process unit from a JSON description and appends it to the
 * pipeline @p pipeline of the media @p media.
 *
 * @param pipeline identifier of the pipeline to which the unit is appended
 * @param media name of the media whose pipeline is modified
 * @param jsonGroupString JSON object text configuring the unit; its "class"
 *        member selects the process unit factory used to create the unit
 * @return true when the unit has been appended
 */
bool addPipelineUnit(const std::string& pipeline,
const std::string& media,
const std::string& jsonGroupString);

const std::string analyzeText(const std::string& text,
const std::string& lang,
const std::string& pipeline,
Expand Down Expand Up @@ -184,6 +191,7 @@ class LimaAnalyzerPrivate
/** Reset all members used to store analysis states. To be called before handling a new analysis. */
void reset();

/**
 * Parses @p in as a JSON document and returns its top-level object. An empty
 * object is returned (and a debug message is emitted) when @p in is not valid
 * JSON or when the document is not a JSON object.
 */
QJsonObject objectFromString(const QString& in);

QString previousNeType;

Expand Down Expand Up @@ -324,7 +332,7 @@ LimaAnalyzerPrivate::LimaAnalyzerPrivate(const QStringList& iqlangs,
// << clientId << std::endl;

// initialize linguistic processing
Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig(
XMLConfigurationFileParser lpconfig(
(configDir + "/" + lpConfigFile.c_str()));
LinguisticProcessingClientFactory::changeable().configureClientFactory(
clientId,
Expand Down Expand Up @@ -475,6 +483,71 @@ void LimaAnalyzerPrivate::initMetaData ()
}
}

/**
 * Public entry point forwarding to the private implementation.
 *
 * When the implementation reports a failure, the error flag is raised and
 * errorMessage is filled so that callers can inspect what went wrong.
 *
 * @param pipeline identifier of the pipeline to extend
 * @param media name of the media owning the pipeline
 * @param jsonGroupString JSON text describing the unit to add
 * @return true on success, false otherwise
 */
bool LimaAnalyzer::addPipelineUnit(const std::string& pipeline,
                                   const std::string& media,
                                   const std::string& jsonGroupString)
{
  const bool unitAdded = m_d->addPipelineUnit(pipeline, media, jsonGroupString);
  if (unitAdded)
  {
    return true;
  }

  // Record the failure in the public error state of the analyzer
  error = true;
  errorMessage = "addPipelineUnit: failed";
  return false;
}

/**
 * Converts a JSON text into a QJsonObject.
 *
 * @param in the JSON text to parse (converted to UTF-8 before parsing)
 * @return the top-level object of the parsed document, or an empty object
 *         when the text is not valid JSON or its top-level value is not an
 *         object; both failure cases are reported through qDebug()
 */
QJsonObject LimaAnalyzerPrivate::objectFromString(const QString& in)
{
  const QJsonDocument parsed = QJsonDocument::fromJson(in.toUtf8());

  // A null document means that parsing failed
  if (parsed.isNull())
  {
    qDebug() << "Invalid JSON...\n" << in << Qt::endl;
    return QJsonObject();
  }

  // Valid JSON, but the top-level value is not an object
  if (!parsed.isObject())
  {
    qDebug() << "Document is not an object" << Qt::endl;
    return QJsonObject();
  }

  return parsed.object();
}

bool LimaAnalyzerPrivate::addPipelineUnit(const std::string& pipeline,
const std::string& media,
const std::string& jsonGroupString)
{
auto jsonGroup = QJsonDocument::fromJson(QByteArray::fromStdString(jsonGroupString)).object();
auto mediaid = Lima::Common::MediaticData::MediaticData::single().getMediaId(media);
auto pipe = MediaProcessors::changeable().getPipelineForId(mediaid, pipeline);
auto managers = Lima::MediaProcessors::single().managers();
// QString jsonGroupString =
// "{ \"name\":\"cpptftokenizer\", "
// " \"class\":\"CppUppsalaTensorFlowTokenizer\", "
// " \"model_prefix\": \"tokenizer-eng\" }";

GroupConfigurationStructure unitConfig(jsonGroup);

// managers (Manager*) used in init of ProcessUnits are stored in
// m_pipelineManagers of MediaProcessors
auto pu = MediaProcessUnit::Factory::getFactory(
jsonGroup["class"].toString().toStdString())->create(
unitConfig,
managers[mediaid]);
pipe->push_back(pu);

return true;
}

std::string LimaAnalyzer::analyzeText(const std::string& text,
const std::string& lang,
const std::string& pipeline,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ class LimaAnalyzer
const std::string& pipeline="",
const std::string& meta="");

/**
 * Adds a process unit, described by a JSON object, to the pipeline
 * @p pipeline of the media @p media.
 *
 * @param pipeline identifier of the pipeline to extend
 * @param media name of the media owning the pipeline
 * @param jsonGroupString JSON text configuring the unit to add; its "class"
 *        member selects the kind of process unit to create
 * @return true on success; on failure returns false, sets @c error to true
 *         and fills @c errorMessage
 */
bool addPipelineUnit(const std::string& pipeline,
const std::string& media,
const std::string& jsonGroupString);

bool error = false;
std::string errorMessage = "";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ namespace LinguisticProcessing
/**
@author Benoit Mathieu
*/
class LIMA_CORELINGUISTICPROCESSINGCLIENT_EXPORT CoreLinguisticProcessingClient : public AbstractLinguisticProcessingClient
class LIMA_CORELINGUISTICPROCESSINGCLIENT_EXPORT CoreLinguisticProcessingClient :
public AbstractLinguisticProcessingClient
{
public:
CoreLinguisticProcessingClient();
Expand All @@ -34,15 +35,16 @@ class LIMA_CORELINGUISTICPROCESSINGCLIENT_EXPORT CoreLinguisticProcessingClient
const std::map<std::string,std::string>& metaData,
const std::string& pipeline,
const std::map<std::string, AbstractAnalysisHandler*>& handlers,
const std::set<std::string>& inactiveUnits = std::set<std::string>()) const override;
const std::set<std::string>& inactiveUnits = std::set<std::string>()
) const override;

std::shared_ptr<AnalysisContent> analyze(
const std::string& texte,
const std::map<std::string,std::string>& metaData,
const std::string& pipeline,
const std::map<std::string, AbstractAnalysisHandler*>& handlers,
const std::set<std::string>& inactiveUnits = std::set<std::string>()) const override
;
const std::set<std::string>& inactiveUnits = std::set<std::string>()
) const override;
};

class CoreLinguisticProcessingClientFactory : public AbstractLinguisticProcessingClientFactory
Expand Down
Loading

0 comments on commit fbbcd04

Please sign in to comment.