Skip to content

Commit 446136b

Browse files
jriesmeier, mrbean-bremen
authored and committed
DcmCharString: add some support for multi-byte characters
- add DcmCharString::getVM() and getOFString(), which handle multi-byte charsets
- DcmByteString::containsExtendedCharacters(): add check for ESCAPE characters (only allowed in code extensions)
- removed obsolete DcmCharString::containsExtendedCharacters()
1 parent 947f2b4 commit 446136b

File tree

20 files changed

+500
-94
lines changed

20 files changed

+500
-94
lines changed

.github/workflows/cmake-win.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ jobs:
3333
3434
# Uncompress support libraries into directory c:\dcmtk_support\libs.
3535
# We rename the original directory to libs so that the rest of the script
36-
# can use the ame path even if the support library package is updated
36+
# can use the same path even if the support library package is updated
3737
# in the future in the download task above.
3838
- name: Uncompress Support libraries
3939
shell: pwsh

dcmdata/include/dcmtk/dcmdata/dcbytstr.h

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -274,20 +274,19 @@ class DCMTK_DCMDATA_EXPORT DcmByteString: public DcmElement
274274
*/
275275
virtual OFCondition verify(const OFBool autocorrect = OFFalse);
276276

277-
/** check if this element contains non-ASCII characters. Please note that this check
278-
* is pretty simple and only works for single-byte character sets that do include
279-
* the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other words: All character
280-
* codes below 128 are considered to be ASCII codes and all others are considered to
281-
* be non-ASCII.
277+
/** check if this element contains non-ASCII characters.
278+
* This works by checking for any byte values above 127, which works for any
279+
* single-byte code and for single-value multi-byte codes, and for ESC characters,
280+
* which will mean that a code extension is used.
282281
* @param checkAllStrings if true, also check elements with string values not affected
283282
* by SpecificCharacterSet (0008,0005). By default, only check PN, LO, LT, SH, ST,
284-
* UC and UT, i.e. none of the derived VR classes.
283+
* UC and UT.
285284
* @return true if element contains non-ASCII characters, false otherwise
286285
*/
287286
virtual OFBool containsExtendedCharacters(const OFBool checkAllStrings = OFFalse);
288287

289288
/** check if this element is affected by SpecificCharacterSet
290-
* @return always returns false since none of the derived VR classes is affected by
289+
* @return false by default; overridden by derived VR classes that are affected by
291290
* the SpecificCharacterSet (0008,0005) element
292291
*/
293292
virtual OFBool isAffectedBySpecificCharacterSet() const;
@@ -379,6 +378,11 @@ class DCMTK_DCMDATA_EXPORT DcmByteString: public DcmElement
379378
*/
380379
virtual OFCondition makeMachineByteString(const Uint32 length = 0);
381380

381+
/** check if the VR supports more than one value.
382+
* @return OFTrue
383+
*/
384+
virtual OFBool supportsMultiValue() const { return OFTrue; };
385+
382386
/** convert currently stored string value to DICOM representation.
383387
* It removes trailing spaces apart from a possibly required single padding
384388
* character (in case of odd string length).
@@ -420,11 +424,10 @@ class DCMTK_DCMDATA_EXPORT DcmByteString: public DcmElement
420424

421425
/* --- static helper functions --- */
422426

423-
/** check if a given character string contains non-ASCII characters.
424-
* Please note that this check is pretty simple and only works for single-byte character
425-
* sets that do include the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other
426-
* words: All character codes below 128 are considered to be ASCII codes and all others
427-
* are considered to be non-ASCII.
427+
/** check if a given character string contains non-ASCII characters.
428+
* This works by checking for any byte values above 127, which works for any
429+
* single-byte code and for single-value multi-byte codes, and for ESC characters,
430+
* which will mean that a code extension is used.
428431
* @param stringVal character string to be checked
429432
* @param stringLen length of the string (number of characters without the trailing
430433
* NULL byte)

dcmdata/include/dcmtk/dcmdata/dcchrstr.h

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -107,15 +107,21 @@ class DCMTK_DCMDATA_EXPORT DcmCharString
107107
*/
108108
virtual OFCondition verify(const OFBool autocorrect = OFFalse);
109109

110-
/** check if this element contains non-ASCII characters. Please note that this check
111-
* is pretty simple and only works for single-byte character sets that do include
112-
* the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other words: All character
113-
* codes below 128 are considered to be ASCII codes and all others are considered to
114-
* be non-ASCII.
115-
* @param checkAllStrings not used in this class
116-
* @return true if element contains non-ASCII characters, false otherwise
110+
/** get value multiplicity
111+
* @return number of string components (separated by a backslash)
117112
*/
118-
virtual OFBool containsExtendedCharacters(const OFBool checkAllStrings = OFFalse);
113+
virtual unsigned long getVM();
114+
115+
/** get a copy of a particular string component
116+
* @param stringVal variable in which the result value is stored
117+
* @param pos index of the value in case of multi-valued elements (0..vm-1)
118+
* @param normalize not used since string normalization depends on value representation
119+
* @return status, EC_Normal if successful, an error code otherwise
120+
*/
121+
virtual OFCondition getOFString(OFString &stringVal,
122+
const unsigned long pos,
123+
OFBool normalize = OFTrue);
124+
119125

120126
/** check if this element is affected by SpecificCharacterSet
121127
* @return always returns true since all derived VR classes are affected by the
@@ -169,6 +175,16 @@ class DCMTK_DCMDATA_EXPORT DcmCharString
169175
*/
170176
virtual const OFString& getDelimiterChars() const;
171177

178+
private:
179+
180+
/** helper method to get VM or component indexes from the value, considering the character encoding
181+
* @param pos index of the component, or -1 if only the VM is needed
182+
* @param start receives a pointer to the component with index "pos", if pos is >= 0
183+
* @param end receives a pointer to the component end, if pos is >= 0
184+
* @param vm receives the VM of the value if pos is -1
185+
* @return status, EC_Normal if successful, EC_IllegalParameter if pos is invalid
186+
*/
187+
OFCondition getIndexOfPosition(long pos, const char*& start, const char*& end, unsigned long& vm);
172188
};
173189

174190

dcmdata/include/dcmtk/dcmdata/dcitem.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -317,11 +317,10 @@ class DCMTK_DCMDATA_EXPORT DcmItem
317317
*/
318318
virtual OFBool containsUnknownVR() const;
319319

320-
/** check if this object contains non-ASCII characters at any nesting level. Please note
321-
* that this check is pretty simple and only works for single-byte character sets that
322-
* do include the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other words: All
323-
* character codes below 128 are considered to be ASCII codes and all others are
324-
* considered to be non-ASCII.
320+
/** check if this object contains non-ASCII characters at any nesting level.
321+
* This works by checking for any byte values above 127, which works for any
322+
* single-byte code and for single-value multi-byte codes, and for ESC characters,
323+
* which will mean that a code extension is used.
325324
* @param checkAllStrings if true, also check elements with string values not affected
326325
* by SpecificCharacterSet (0008,0005). By default, only check PN, LO, LT, SH, ST,
327326
* UC and UT.

dcmdata/include/dcmtk/dcmdata/dcsequen.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -313,11 +313,10 @@ class DCMTK_DCMDATA_EXPORT DcmSequenceOfItems : public DcmElement
313313
*/
314314
virtual OFBool containsUnknownVR() const;
315315

316-
/** check if this object contains non-ASCII characters at any nesting level. Please note
317-
* that this check is pretty simple and only works for single-byte character sets that
318-
* do include the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other words: All
319-
* character codes below 128 are considered to be ASCII codes and all others are
320-
* considered to be non-ASCII.
316+
/** check if this object contains non-ASCII characters at any nesting level.
317+
* This works by checking for any byte values above 127, which works for any
318+
* single-byte code and for single-value multi-byte codes, and for ESC characters,
319+
* which will mean that a code extension is used.
321320
* @param checkAllStrings if true, also check elements with string values not affected
322321
* by SpecificCharacterSet (0008,0005). By default, only check PN, LO, LT, SH, ST,
323322
* UC and UT.

dcmdata/include/dcmtk/dcmdata/dcvrlt.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -114,13 +114,6 @@ class DCMTK_DCMDATA_EXPORT DcmLongText
114114
virtual OFCondition checkValue(const OFString &vm = "",
115115
const OFBool oldFormat = OFFalse);
116116

117-
/** get the value multiplicity.
118-
* Since the backslash "\" is not regarded as a separator the value
119-
* multiplicity is always 1.
120-
* @return value multiplicity of the currently stored value
121-
*/
122-
virtual unsigned long getVM();
123-
124117
/** get a copy of a particular string component
125118
* @param stringVal variable in which the result value is stored
126119
* @param pos index of the value in case of multi-valued elements (0..vm-1)
@@ -151,6 +144,14 @@ class DCMTK_DCMDATA_EXPORT DcmLongText
151144
*/
152145
static OFCondition checkStringValue(const OFString &value,
153146
const OFString &charset = "");
147+
148+
protected:
149+
/** check if the VR supports more than one value.
150+
* Since the backslash "\" is not regarded as a separator,
151+
* multiple values cannot be encoded.
152+
* @return OFFalse
153+
*/
154+
virtual OFBool supportsMultiValue() const { return OFFalse; };
154155
};
155156

156157

dcmdata/include/dcmtk/dcmdata/dcvrst.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -115,13 +115,6 @@ class DCMTK_DCMDATA_EXPORT DcmShortText
115115
virtual OFCondition checkValue(const OFString &vm = "",
116116
const OFBool oldFormat = OFFalse);
117117

118-
/** get the value multiplicity.
119-
* Since the backslash "\" is not regarded as a separator the value
120-
* multiplicity is always 1.
121-
* @return value multiplicity of the currently stored value
122-
*/
123-
virtual unsigned long getVM();
124-
125118
/** get a copy of a particular string component
126119
* @param stringVal variable in which the result value is stored
127120
* @param pos index of the value in case of multi-valued elements (0..vm-1)
@@ -152,6 +145,13 @@ class DCMTK_DCMDATA_EXPORT DcmShortText
152145
*/
153146
static OFCondition checkStringValue(const OFString &value,
154147
const OFString &charset = "");
148+
protected:
149+
/** check if the VR supports more than one value.
150+
* Since the backslash "\" is not regarded as a separator,
151+
* multiple values cannot be encoded.
152+
* @return OFFalse
153+
*/
154+
virtual OFBool supportsMultiValue() const { return OFFalse; };
155155
};
156156

157157

dcmdata/include/dcmtk/dcmdata/dcvrut.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -117,13 +117,6 @@ class DCMTK_DCMDATA_EXPORT DcmUnlimitedText
117117
virtual OFCondition checkValue(const OFString &vm = "",
118118
const OFBool oldFormat = OFFalse);
119119

120-
/** get the value multiplicity.
121-
* Since the backslash "\" is not regarded as a separator the value
122-
* multiplicity is always 1.
123-
* @return value multiplicity of the currently stored value
124-
*/
125-
virtual unsigned long getVM();
126-
127120
/** get a copy of a particular string component
128121
* @param stringVal variable in which the result value is stored
129122
* @param pos index of the value in case of multi-valued elements (0..vm-1)
@@ -154,6 +147,13 @@ class DCMTK_DCMDATA_EXPORT DcmUnlimitedText
154147
*/
155148
static OFCondition checkStringValue(const OFString &value,
156149
const OFString &charset = "");
150+
protected:
151+
/** check if the VR supports more than one value.
152+
* Since the backslash "\" is not regarded as a separator,
153+
* multiple values cannot be encoded.
154+
* @return OFFalse
155+
*/
156+
virtual OFBool supportsMultiValue() const { return OFFalse; };
157157
};
158158

159159

dcmdata/libsrc/dcbytstr.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -766,7 +766,7 @@ OFBool DcmByteString::containsExtendedCharacters(const OFBool checkAllStrings)
766766
OFBool result = OFFalse;
767767
/* only check if parameter is true since derived VRs are not affected
768768
by the attribute SpecificCharacterSet (0008,0005) */
769-
if (checkAllStrings)
769+
if (checkAllStrings || isAffectedBySpecificCharacterSet())
770770
{
771771
char *str = NULL;
772772
Uint32 len = 0;
@@ -872,10 +872,10 @@ OFBool DcmByteString::containsExtendedCharacters(const char *stringVal,
872872
{
873873
if (stringVal != NULL)
874874
{
875-
for (size_t i = stringLen; i != 0; --i)
875+
for (size_t i = stringLen; i != 0; --i, ++stringVal)
876876
{
877-
/* check for 8 bit characters */
878-
if (OFstatic_cast(unsigned char, *stringVal++) > 127)
877+
/* check for 8 bit and Escape characters */
878+
if (*stringVal & 0x80 || *stringVal == 0x1b)
879879
return OFTrue;
880880
}
881881
}

0 commit comments

Comments
 (0)