Skip to content

Commit

Permalink
Chore/31 refactor loader (#46)
Browse files Browse the repository at this point in the history
  • Loading branch information
henomis authored May 5, 2023
1 parent 8a53d60 commit 6528043
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 0 deletions.
25 changes: 25 additions & 0 deletions examples/loader/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package main

import (
"fmt"

"github.com/henomis/lingoose/loader"
)

// main loads the documents for two hard-coded PubMed IDs and prints each
// document's content followed by its metadata, with a dashed rule after
// each part. Any loading error aborts the program with a panic.
func main() {
	const separator = "------"

	pubMedIDs := []string{"33024307", "32265180"}

	documents, err := loader.NewPubmedLoader(pubMedIDs).Load()
	if err != nil {
		panic(err)
	}

	for i := range documents {
		fmt.Println(documents[i].Content)
		fmt.Println(separator)
		fmt.Println(documents[i].Metadata)
		fmt.Println(separator)
	}
}
File renamed without changes.
File renamed without changes.
83 changes: 83 additions & 0 deletions loader/pubmed.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package loader

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"strings"

	"github.com/henomis/lingoose/document"
	"github.com/henomis/lingoose/types"
)

// pubMedBioCURLFormat is the fmt-style URL template used to request an
// article as BioC JSON; the single %s verb is filled with a PubMed ID.
const pubMedBioCURLFormat = "https://ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/%s/unicode"

// pubMedDocument mirrors the subset of the JSON response that the loader
// decodes: a list of documents, each holding a list of text passages.
// JSON fields without a matching struct field are ignored by encoding/json.
type pubMedDocument struct {
// Documents is decoded from the top-level "documents" array.
Documents []struct {
// Passages is decoded from each document's "passages" array.
Passages []struct {
// Text is the passage's "text" value.
Text string `json:"text"`
} `json:"passages"`
} `json:"documents"`
}

// pubMedLoader loads documents for a fixed list of PubMed IDs.
type pubMedLoader struct {
// pubMedIDs holds the IDs to load; Load iterates over them in order.
pubMedIDs []string
}

// NewPubmedLoader returns a loader for the given PubMed IDs.
//
// The slice is stored as-is (it is not copied), so later changes the caller
// makes to its elements are visible to the loader.
func NewPubmedLoader(pubMedIDs []string) *pubMedLoader {
	l := new(pubMedLoader)
	l.pubMedIDs = pubMedIDs
	return l
}

// Load retrieves one document per configured PubMed ID, in the same order
// as the IDs. It stops at the first failure and returns that error together
// with a nil slice.
func (p *pubMedLoader) Load() ([]document.Document, error) {
	docs := make([]document.Document, 0, len(p.pubMedIDs))

	for _, id := range p.pubMedIDs {
		loaded, err := p.load(id)
		if err != nil {
			return nil, err
		}
		docs = append(docs, *loaded)
	}

	return docs, nil
}

// load downloads the BioC JSON payload for a single PubMed ID and flattens
// it into one document.
//
// The text of every passage of every document in the response is
// concatenated in response order, with no separator between passages. The
// request URL is stored under the "source" metadata key.
//
// An error is returned when the request fails, the server answers with a
// non-2xx status, the body cannot be read, or the body cannot be decoded
// into pubMedDocument. Underlying errors are wrapped with %w, so callers can
// still inspect them with errors.Is / errors.As.
func (p *pubMedLoader) load(pubMedID string) (*document.Document, error) {
	sourceURL := fmt.Sprintf(pubMedBioCURLFormat, pubMedID)

	resp, err := http.Get(sourceURL)
	if err != nil {
		return nil, fmt.Errorf("pubmed loader: fetching article %q: %w", pubMedID, err)
	}
	defer resp.Body.Close()

	// Reject error responses before decoding: an error page would otherwise
	// surface as a confusing JSON syntax error, or (if it happened to be
	// valid JSON) as a silently empty document.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return nil, fmt.Errorf("pubmed loader: fetching article %q: unexpected status %s", pubMedID, resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("pubmed loader: reading article %q: %w", pubMedID, err)
	}

	var payload pubMedDocument
	if err := json.Unmarshal(body, &payload); err != nil {
		return nil, fmt.Errorf("pubmed loader: decoding article %q: %w", pubMedID, err)
	}

	// strings.Builder avoids re-copying the accumulated text on every
	// passage, which repeated string += would do.
	var content strings.Builder
	for _, doc := range payload.Documents {
		for _, passage := range doc.Passages {
			content.WriteString(passage.Text)
		}
	}

	return &document.Document{
		Content: content.String(),
		Metadata: types.Meta{
			"source": sourceURL,
		},
	}, nil
}
File renamed without changes.

0 comments on commit 6528043

Please sign in to comment.