Skip to content

Commit ce8b151

Browse files
committed
Update JuFiT to Version 1.2
* Update Semantic Groups from https://metamap.nlm.nih.gov/Docs/SemGroups_2018.txt * Removed a description error in Main.java and the Readme file.
1 parent ff2ea70 commit ce8b151

11 files changed

+202
-127
lines changed

.gitignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,11 @@ target
55
.project
66
.classpath
77
/.DS_Store
8+
*.gz
9+
*.txt
10+
*.sh
11+
*.RRF
12+
*.nlm
13+
*.CHK
14+
*.MD5
15+
*.zip
2.72 MB
Binary file not shown.
2.72 MB
Binary file not shown.

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
Copyright (c) 2018, JULIE Lab
1+
Copyright (c) 2020, JULIE Lab
22
All rights reserved.
33

44
Redistribution and use in source and binary forms, with or without

README.md

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,15 @@ BSD license (2-clause).
1515

1616
Please cite [JuFiT: A Configurable Rule Engine for Filtering and Generating New Multilingual UMLS Terms](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4765630/) when using JuFiT in your research.
1717

18-
1918
## Usage
2019
```
21-
java -jar <JUFIT-JAR>
20+
java -jar <JuFiT-file.jar>
2221
```
2322
followed by (on the same line)
2423
```
25-
jufit <mrconso> <mrsty> <language> (--mrconso | --terms | --grounded | --complex) [--outFile=FILE] [--semanticGroup=GROUP]... [--rules=JSON] [--noFilter]
26-
jufit --help
27-
jufit --version
28-
24+
<mrconso> <mrsty> <language> (--mrconso | --terms | --grounded | --complex) [--outFile=FILE] [--semanticGroup=GROUP] ... [--rules=JSON] [--noFilter]
25+
--help
26+
--version
2927
Options:
3028
--help Show this screen
3129
--version Show the version number
@@ -44,3 +42,4 @@ We suggest updating your Java VM arguments to use at least 0.5GB of RAM, i.e., `
4442

4543
## Recent Changes:
4644
* 1.1 New output formats, new command line interface, change to license
45+
* 1.2 Updated Semantic Group definitions (from https://metamap.nlm.nih.gov/Docs/SemGroups_2018.txt)
Binary file not shown.

pom.xml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1-
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
1+
<project
2+
xmlns="http://maven.apache.org/POM/4.0.0"
3+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
24
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5+
36
<properties>
47
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
58
<maven.compiler.source>1.8</maven.compiler.source>
@@ -14,7 +17,7 @@
1417

1518
<modelVersion>4.0.0</modelVersion>
1619
<artifactId>JenaUmlsFilter</artifactId>
17-
<version>1.1</version>
20+
<version>1.2</version>
1821
<dependencies>
1922
<dependency>
2023
<groupId>junit</groupId>

src/main/java/de/julielab/provider/SemanticGroup.java

Lines changed: 23 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -25,94 +25,31 @@
2525
*
2626
* First entry on each enum, i.e., short name, was then manually added
2727
*
28-
* @author hellrich
28+
* Update from 04/27/2020:
29+
* The definition of the Semantic Groups and the Semantic Types changed.
30+
* We updated the Semantic Types of the Semantic Groups from
31+
* https://metamap.nlm.nih.gov/Docs/SemGroups_2018.txt
32+
*
33+
* @author hellrich, chlor
2934
*
3035
*/
36+
3137
public enum SemanticGroup {
32-
ACTI("Activities & Behaviors", "T052", "T053", "T056", "T051", "T064",
33-
"T055", "T066", "T057", "T054"), ANAT("Anatomy", "T017", "T029",
34-
"T023", "T030", "T031", "T022", "T025", "T026", "T018",
35-
"T021", "T024"), CHEM("Chemicals & Drugs", "T116", "T195",
36-
"T123", "T122", "T118", "T103", "T120", "T104",
37-
"T200", "T111", "T196", "T126", "T131", "T125",
38-
"T129", "T130", "T197", "T119", "T124", "T114",
39-
"T109", "T115", "T121", "T192", "T110",
40-
"T127"), CONC("Concepts & Ideas", "T185", "T077",
41-
"T169", "T102", "T078", "T170", "T171",
42-
"T080", "T081", "T089", "T082",
43-
"T079"), DEVI("Devices", "T203", "T074",
44-
"T075"), DISO("Disorders", "T020",
45-
"T190", "T049", "T019",
46-
"T047", "T050", "T033",
47-
"T037", "T048", "T191",
48-
"T046", "T184"), GENE(
49-
"Genes & Molecular Sequences",
50-
"T087", "T088",
51-
"T028", "T085",
52-
"T086"), GEOG(
53-
"Geographic Areas",
54-
"T083"), LIVB(
55-
"Living Beings",
56-
"T100",
57-
"T011",
58-
"T008",
59-
"T194",
60-
"T007",
61-
"T012",
62-
"T204",
63-
"T099",
64-
"T013",
65-
"T004",
66-
"T096",
67-
"T016",
68-
"T015",
69-
"T001",
70-
"T101",
71-
"T002",
72-
"T098",
73-
"T097",
74-
"T014",
75-
"T010",
76-
"T005"), OBJC(
77-
"Objects",
78-
"T071",
79-
"T168",
80-
"T073",
81-
"T072",
82-
"T167"), OCCU(
83-
"Occupations",
84-
"T091",
85-
"T090"), ORGA(
86-
"Organizations",
87-
"T093",
88-
"T092",
89-
"T094",
90-
"T095"), PHEN(
91-
"Phenomena",
92-
"T038",
93-
"T069",
94-
"T068",
95-
"T034",
96-
"T070",
97-
"T067"), PHYS(
98-
"Physiology",
99-
"T043",
100-
"T201",
101-
"T045",
102-
"T041",
103-
"T044",
104-
"T032",
105-
"T040",
106-
"T042",
107-
"T039"), PROC(
108-
"Procedures",
109-
"T060",
110-
"T065",
111-
"T058",
112-
"T059",
113-
"T063",
114-
"T062",
115-
"T061");
38+
ACTI("Activities & Behaviors", "T052", "T053", "T056", "T051", "T064", "T055", "T066", "T057", "T054"),
39+
ANAT("Anatomy", "T017", "T029", "T023", "T030", "T031", "T022", "T025", "T026", "T018", "T021", "T024"),
40+
CHEM("Chemicals & Drugs", "T116", "T195", "T123", "T122", "T118", "T103", "T120", "T104", "T200", "T111", "T196", "T126", "T131", "T125", "T129", "T130", "T197", "T119", "T124", "T114", "T109", "T115", "T121", "T192", "T110", "T127"),
41+
CONC("Concepts & Ideas", "T185", "T077", "T169", "T102", "T078", "T170", "T171", "T080", "T081", "T089", "T082", "T079"),
42+
DEVI("Devices", "T203", "T074", "T075"),
43+
DISO("Disorders", "T020", "T190", "T049", "T019", "T047", "T050", "T033", "T037", "T048", "T191", "T046", "T184"),
44+
GENE("Genes & Molecular Sequences", "T087", "T088", "T028", "T085", "T086"),
45+
GEOG("Geographic Areas", "T083"),
46+
LIVB("Living Beings", "T100", "T011", "T008", "T194", "T007", "T012", "T204", "T099", "T013", "T004", "T096", "T016", "T015", "T001", "T101", "T002", "T098", "T097", "T014", "T010", "T005"),
47+
OBJC("Objects", "T071", "T168", "T073", "T072", "T167"),
48+
OCCU("Occupations", "T091", "T090"),
49+
ORGA("Organizations", "T093", "T092", "T094", "T095"),
50+
PHEN("Phenomena", "T038", "T069", "T068", "T034", "T070", "T067"),
51+
PHYS("Physiology", "T043", "T201", "T045", "T041", "T044", "T032", "T040", "T042", "T039"),
52+
PROC("Procedures", "T060", "T065", "T058", "T059", "T063", "T062", "T061");
11653

11754
private final static Map<String, SemanticGroup> termId2group = new HashMap<>();
11855

@@ -141,5 +78,4 @@ private SemanticGroup(final String... strings) {
14178
termIds = ImmutableSet
14279
.copyOf(Arrays.asList(strings).subList(1, strings.length));
14380
}
144-
145-
}
81+
}

src/main/java/de/julielab/umlsfilter/cli/Main.java

Lines changed: 33 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,13 @@
3030

3131
public class Main {
3232

33-
public static final String VERSION = "1.1";
33+
public static final String VERSION = "1.2";
3434

3535
private static final String doc = "Usage:\n"
36-
+ " jufit <mrconso> <mrsty> <language> (--mrconso | --terms | --grounded | --complex) [--outFile=FILE] [--semanticGroup=GROUP]... [--rules=JSON] [--noFilter]\n"
37-
+ " jufit --help\n" + " jufit --version\n" + "\nOptions:\n"
36+
+ " java -jar <JuFiT-file.jar> "
37+
+ " followed by (on the same line)"
38+
+ " <mrconso> <mrsty> <language> (--mrconso | --terms | --grounded | --complex) [--outFile=FILE] [--semanticGroup=GROUP]... [--rules=JSON] [--noFilter]\n"
39+
+ " --help\n" + " jufit --version\n" + "\nOptions:\n"
3840
+ "--help Show this screen\n"
3941
+ "--version Show the version number\n"
4042
+ "--mrconso MRCONSO output format (one format must be chosen)\n"
@@ -49,67 +51,67 @@ public class Main {
4951

5052
@SuppressWarnings("unchecked")
5153
public static void main(final String[] args) throws IOException {
52-
final Map<String, Object> opts = new Docopt(doc).withVersion(VERSION)
53-
.parse(args);
54+
final Map<String, Object> opts = new Docopt(doc).withVersion(VERSION).parse(args);
5455
final String pathToMRCONSO = (String) opts.get("<mrconso>");
5556
final String pathToMRSTY = (String) opts.get("<mrsty>");
5657
final String language = (String) opts.get("<language>");
58+
5759
if (language.length() != 3) {
58-
System.err.println(
59-
"Only 3 letter languages codes supported, e.g., ENG for English");
60+
System.err.println("Only 3 letter languages codes supported, e.g., ENG for English");
6061
System.exit(1);
6162
}
62-
final String jsonFile = (String) opts.get("--rules"); //may be null, respected later
6363

64+
final String jsonFile = (String) opts.get("--rules"); //may be null, respected later
6465
final Set<SemanticGroup> onlyTheseSemanticGroups = new HashSet<>();
66+
6567
try {
6668
((List<String>) opts.get("--semanticGroup")).stream()
6769
.map(SemanticGroup::valueOf)
6870
.forEach(onlyTheseSemanticGroups::add);
6971
} catch (final IllegalArgumentException e) {
7072
System.err.println(
7173
"Only the following semantic group names are supported:\n"
72-
+ SemanticGroup.getNames()
73-
.collect(Collectors.joining(", ")));
74+
+ SemanticGroup.getNames().collect(Collectors.joining(", ")));
7475
System.exit(1);
7576
}
7677

7778
OutputFormat outputFormat = null;
78-
if ((boolean) opts.get("--mrconso"))
79+
if ((boolean) opts.get("--mrconso")){
7980
outputFormat = OutputFormat.MRCONSO;
80-
else if ((boolean) opts.get("--terms"))
81+
}
82+
else if ((boolean) opts.get("--terms")){
8183
outputFormat = OutputFormat.TERMS;
82-
else if ((boolean) opts.get("--grounded"))
84+
}
85+
else if ((boolean) opts.get("--grounded")){
8386
outputFormat = OutputFormat.GROUNDED_TERMS;
84-
else if ((boolean) opts.get("--complex"))
87+
}
88+
else if ((boolean) opts.get("--complex")){
8589
outputFormat = OutputFormat.COMPLEX;
86-
if (outputFormat == null)
87-
throw new IllegalArgumentException(
88-
"No valid output format selected!");
90+
}
91+
if (outputFormat == null){
92+
throw new IllegalArgumentException("No valid output format selected!");
93+
}
8994

9095
final boolean applyFilters = !(boolean) opts.get("--noFilter");
91-
if (!applyFilters && (OutputFormat.MRCONSO == outputFormat))
92-
throw new IllegalArgumentException(
93-
"Applying no filtering while producing MRCONSO format is pointless");
96+
if (!applyFilters && (OutputFormat.MRCONSO == outputFormat)){
97+
throw new IllegalArgumentException("Applying no filtering while producing MRCONSO format is pointless");
98+
}
9499

95100
final String outFileName = (String) opts.get("--outFile"); //may be null, respected later
96-
if(null != outFileName)
101+
if(null != outFileName){
97102
System.setOut(new PrintStream(new BufferedOutputStream(new FileOutputStream(outFileName)), true));
98-
103+
}
104+
99105
//Iterate over UMLS to generate list of existing terms
100106
//TODO Currently respects pre-existing terms of all semantic groups, even those later ignored. Trivial change, unsure what is expected behavior?
101107
final Set<String> existingTerms = Streams
102-
.stream(UMLSTermProvider.provideUMLSTerms(pathToMRCONSO,
103-
pathToMRSTY, true, null, language))
104-
.map(ProvidedTerm::getTerm).collect(Collectors.toSet());
108+
.stream(UMLSTermProvider.provideUMLSTerms(pathToMRCONSO, pathToMRSTY, true, null, language))
109+
.map(ProvidedTerm::getTerm).collect(Collectors.toSet());
105110

106111
//Prepare to iterate over UMLS again, this time respecting group restrictions (if any)
107-
final Iterator<ProvidedTerm> iterator = UMLSTermProvider
108-
.provideUMLSTerms(pathToMRCONSO, pathToMRSTY, true,
109-
onlyTheseSemanticGroups, language);
110-
111-
Delemmatizer.delemmatize(iterator, outputFormat, existingTerms,
112-
jsonFile, language, applyFilters);
112+
final Iterator<ProvidedTerm> iterator =
113+
UMLSTermProvider.provideUMLSTerms(pathToMRCONSO, pathToMRSTY, true, onlyTheseSemanticGroups, language);
113114

115+
Delemmatizer.delemmatize(iterator, outputFormat, existingTerms, jsonFile, language, applyFilters);
114116
}
115117
}

0 commit comments

Comments
 (0)