ApryseSDK
diff --git a/‎Samples/DataExtractionTest/GO/DataExtraction_test.go‎
Lines changed: 63 additions & 0 deletions b/‎Samples/DataExtractionTest/GO/DataExtraction_test.go‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎Samples/DataExtractionTest/PHP/DataExtractionTest.php‎
Lines changed: 54 additions & 2 deletions b/‎Samples/DataExtractionTest/PHP/DataExtractionTest.php‎
Lines changed: 54 additions & 2 deletions
diff --git a/‎Samples/DataExtractionTest/PYTHON/DataExtractionTest.py‎
Lines changed: 49 additions & 0 deletions b/‎Samples/DataExtractionTest/PYTHON/DataExtractionTest.py‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎Samples/DataExtractionTest/RUBY/DataExtractionTest.rb‎
Lines changed: 56 additions & 0 deletions b/‎Samples/DataExtractionTest/RUBY/DataExtractionTest.rb‎
Lines changed: 56 additions & 0 deletions
diff --git a/‎Samples/TestFiles/Email.pdf‎
188 KB b/‎Samples/TestFiles/Email.pdf‎
188 KB
diff --git a/‎Samples/TestFiles/Invoice.pdf‎
488 KB b/‎Samples/TestFiles/Invoice.pdf‎
488 KB
diff --git a/‎Samples/TestFiles/Scientific_Publication.pdf‎
269 KB b/‎Samples/TestFiles/Scientific_Publication.pdf‎
269 KB
@@ -302,6 +302,60 @@ func GenericKeyValueTest() (err error) {
 	return nil
 }
 
+//---------------------------------------------------------------------------------------
+// The following sample illustrates how to extract document classes from PDF documents.
+//---------------------------------------------------------------------------------------
+
+func DocClassifierTest() (err error) {
+	defer catch(&err) // NOTE(review): catch is defined outside this hunk; presumably it turns a panic into err - confirm
+
+	// Test if the add-on is installed; if not, print setup guidance and return nil without classifying anything
+	if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_DocClassification) {
+		fmt.Println("")
+		fmt.Println("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
+		fmt.Println("-----------------------------------------------------------------------------")
+		fmt.Println("The Data Extraction suite is an optional add-on, available for download")
+		fmt.Println("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
+		fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
+		fmt.Println("using the PDFNetAddResourceSearchPath() function.")
+		fmt.Println("")
+		return nil
+	}
+
+	// Simple example: classify pages as a JSON file (the output path is passed straight to the extraction call)
+	fmt.Println("Classify pages as a JSON file")
+
+	inputFile := inputPath + "Invoice.pdf"
+	outputFile := outputPath + "Invoice_Classified.json"
+	DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_DocClassification)
+
+	fmt.Println("Result saved in " + outputFile)
+
+	// Classify pages as a JSON string: no output path is passed; the returned value is asserted to string and saved via WriteTextToFile
+	fmt.Println("Classify pages as a JSON string")
+
+	inputFile = inputPath + "Scientific_Publication.pdf"
+	outputFile = outputPath + "Scientific_Publication_Classified.json"
+	json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_DocClassification).(string)
+	WriteTextToFile(outputFile, json)
+
+	fmt.Println("Result saved in " + outputFile)
+
+	// Example with customized options: a DataExtractionOptions value is passed as the fourth argument
+	fmt.Println("Classify pages with customized options")
+
+	inputFile = inputPath + "Email.pdf"
+	outputFile = outputPath + "Email_Classified.json"
+	options := NewDataExtractionOptions()
+	// Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
+	options.SetMinimumConfidenceThreshold(0.7)
+	DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_DocClassification, options)
+
+	fmt.Println("Result saved in " + outputFile)
+
+	return nil
+}
+
 //---------------------------------------------------------------------------------------
 
 func TestDataExtraction(t *testing.T) {
@@ -335,13 +389,22 @@ func TestDataExtraction(t *testing.T) {
 		fmt.Println(fmt.Errorf("Unable to extract form fields data, error: %s", err))
 	}
 
+	//-----------------------------------------------------------------------------------
+
 	err = GenericKeyValueTest()
 	if err != nil {
 		fmt.Println(fmt.Errorf("Unable to extract key-value pairs, error: %s", err))
 	}
 
 	//-----------------------------------------------------------------------------------
 
+	err = DocClassifierTest()
+	if err != nil {
+		fmt.Println(fmt.Errorf("Unable to extract document classifications, error: %s", err))
+	}
+
+	//-----------------------------------------------------------------------------------
+
 	PDFNetTerminate()
 	fmt.Println("Done.")
 }
@@ -212,7 +212,7 @@ function main()
 	}
 
 	//////////////////////////////////////////////////////////////////////////
-	// The following sample illustrates how to extract document structure from PDF documents.
+	// The following sample illustrates how to extract key-value pairs from PDF documents.
 	//////////////////////////////////////////////////////////////////////////
 
 	// Test if the add-on is installed
@@ -239,7 +239,7 @@ function main()
 			// Example with customized options:
 			// Extract Keys & Values from pages 2-4, excluding ads
 			$options = new DataExtractionOptions();
-			$options->setPages("2-4");
+			$options->SetPages("2-4");
 
 			$p2ExclusionZones = new RectCollection();
 			// Exclude the ad on page 2
@@ -267,6 +267,58 @@ function main()
 		}
 	}
 
+	//////////////////////////////////////////////////////////////////////////
+	// The following sample illustrates how to extract document classes from PDF documents.
+	//////////////////////////////////////////////////////////////////////////
+
+	// Test if the add-on is installed
+	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocClassification)) {
+		echo(nl2br("\n"));
+		echo(nl2br("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.\n"));
+		echo(nl2br("-----------------------------------------------------------------------------\n"));
+		echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
+		echo(nl2br("at https://docs.apryse.com/documentation/core/info/modules/. If you have already\n"));
+		echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
+		echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
+		echo(nl2br("\n"));
+	}
+	else {
+		try {
+			// Simple example: classify pages as a JSON file (the output path is passed straight to ExtractData)
+			echo(nl2br("Classify pages as a JSON file\n"));
+
+			$outputFile = $outputPath."Invoice_Classified.json";
+			DataExtractionModule::ExtractData($inputPath."Invoice.pdf", $outputFile, DataExtractionModule::e_DocClassification);
+
+			echo(nl2br("Result saved in " . $outputFile . "\n"));
+
+			///////////////////////////////////////////////////////
+			// Classify pages as a JSON string: no output path is passed; the returned value is saved via WriteTextToFile
+			echo(nl2br("Classify pages as a JSON string\n"));
+
+			$outputFile = $outputPath."Scientific_Publication_Classified.json";
+			$json = DataExtractionModule::ExtractData($inputPath."Scientific_Publication.pdf", DataExtractionModule::e_DocClassification);
+			WriteTextToFile($outputFile, $json);
+
+			echo(nl2br("Result saved in " . $outputFile . "\n"));
+
+			///////////////////////////////////////////////////////
+			// Example with customized options: a DataExtractionOptions object is passed as the fourth argument
+			echo(nl2br("Classify pages with customized options\n"));
+
+			$options = new DataExtractionOptions();
+			// Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
+			$options->SetMinimumConfidenceThreshold(0.7);
+			$outputFile = $outputPath."Email_Classified.json";
+			DataExtractionModule::ExtractData($inputPath."Email.pdf", $outputFile, DataExtractionModule::e_DocClassification, $options);
+
+			echo(nl2br("Result saved in " . $outputFile . "\n"));
+		}
+		catch(Exception $e) {
+			echo(nl2br("Unable to extract document classifications, error: " . $e->getMessage() . "\n"));
+		}
+	}
+
 	//-----------------------------------------------------------------------------------
 
 	PDFNet::Terminate();
 
@@ -252,6 +252,55 @@ def main():
                 print("Unable to extract key-value data, error: " + str(e))
 
 
+    #-----------------------------------------------------------------------------------
+    # The following sample illustrates how to extract document classes from PDF documents.
+    #-----------------------------------------------------------------------------------
+
+    # Test if the add-on is installed
+    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_DocClassification):
+        print("")
+        print("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
+        print("-----------------------------------------------------------------------------")
+        print("The Data Extraction suite is an optional add-on, available for download")
+        print("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
+        print("downloaded this module, ensure that the SDK is able to find the required files")
+        print("using the PDFNet.AddResourceSearchPath() function.")
+        print("")
+    else:
+        try:
+            # Simple example: classify pages as a JSON file
+            print("Classify pages as a JSON file")
+
+            outputFile = outputPath + "Invoice_Classified.json"
+            DataExtractionModule.ExtractData(inputPath + "Invoice.pdf", outputFile, DataExtractionModule.e_DocClassification)
+
+            print("Result saved in " + outputFile)
+
+            #------------------------------------------------------
+            # Classify pages as a JSON string
+            print("Classify pages as a JSON string")
+
+            outputFile = outputPath + "Scientific_Publication_Classified.json"
+            json = DataExtractionModule.ExtractData(inputPath + "Scientific_Publication.pdf", DataExtractionModule.e_DocClassification)
+            WriteTextToFile(outputFile, json)
+
+            print("Result saved in " + outputFile)
+
+            #------------------------------------------------------
+            # Example with customized options:
+            print("Classify pages with customized options")
+
+            options = DataExtractionOptions()
+            # Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
+            options.SetMinimumConfidenceThreshold(0.7)
+            outputFile = outputPath + "Email_Classified.json"
+            DataExtractionModule.ExtractData(inputPath + "Email.pdf", outputFile, DataExtractionModule.e_DocClassification, options)
+
+            print("Result saved in " + outputFile)
+
+        except Exception as e:
+            print("Unable to extract document structure data, error: " + str(e))
+
     PDFNet.Terminate()
     print("Done.")
 
 
@@ -200,6 +200,11 @@ def main()
 		end
 	end
 
+	#-----------------------------------------------------------------------------------
+	# The following sample illustrates how to extract key-value pairs from PDF documents.
+	#-----------------------------------------------------------------------------------
+
+	# Test if the add-on is installed
 	if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_GenericKeyValue) then
 		puts ""
 		puts "Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available."
@@ -244,6 +249,57 @@ def main()
 		end
 	end
 
+	#-----------------------------------------------------------------------------------
+	# The following sample illustrates how to extract document classes from PDF documents.
+	#-----------------------------------------------------------------------------------
+
+	# Test if the add-on is installed
+	if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_DocClassification) then
+		puts ""
+		puts "Unable to run Data Extraction: PDFTron SDK Structured Output module not available."
+		puts "-----------------------------------------------------------------------------"
+		puts "The Data Extraction suite is an optional add-on, available for download"
+		puts "at https://docs.apryse.com/documentation/core/info/modules/. If you have already"
+		puts "downloaded this module, ensure that the SDK is able to find the required files"
+		puts "using the PDFNet.AddResourceSearchPath() function."
+		puts ""
+	else
+		begin
+			# Simple example: classify pages as a JSON file (the output path is passed straight to ExtractData)
+			puts "Classify pages as a JSON file"
+
+			outputFile = $outputPath + "Invoice_Classified.json"
+			DataExtractionModule.ExtractData($inputPath + "Invoice.pdf", outputFile, DataExtractionModule::E_DocClassification)
+
+			puts "Result saved in " + outputFile
+
+			#------------------------------------------------------
+			# Classify pages as a JSON string: no output path is passed; the returned value is written with File.open
+			puts "Classify pages as a JSON string"
+
+			outputFile = $outputPath + "Scientific_Publication_Classified.json"
+			json = DataExtractionModule.ExtractData($inputPath + "Scientific_Publication.pdf", DataExtractionModule::E_DocClassification)
+			File.open(outputFile, 'w') { |file| file.write(json) }
+
+			puts "Result saved in " + outputFile
+
+			#------------------------------------------------------
+			# Example with customized options: a DataExtractionOptions object is passed as the fourth argument
+			puts "Classify pages with customized options"
+
+			options = DataExtractionOptions.new()
+			# Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
+			options.SetMinimumConfidenceThreshold(0.7)
+			outputFile = $outputPath + "Email_Classified.json"
+			DataExtractionModule.ExtractData($inputPath + "Email.pdf", outputFile, DataExtractionModule::E_DocClassification, options)
+
+			puts "Result saved in " + outputFile
+
+		rescue => error
+			puts "Unable to extract document classifications, error: " + error.message
+		end
+	end
+
 	#-----------------------------------------------------------------------------------
 
 	PDFNet.Terminate