@ -44,6 +44,11 @@ except LookupError:
HAS_SURROGATEESCAPE = False
# Error-handler names that to_bytes()/to_text() resolve themselves instead of
# handing them straight to the codec: each one becomes 'surrogateescape' when
# HAS_SURROGATEESCAPE is true and a fallback handler otherwise.  None stands
# for the default strategy.
# 'surrogate_or_replace' is the documented handler name; the previous entry
# 'surrogate_or_escape' matched no handler, so 'surrogate_or_replace' was not
# recognised by the membership tests that use this set.
_COMPOSED_ERROR_HANDLERS = frozenset((None, 'surrogate_or_replace',
                                      'surrogate_or_strict',
                                      'surrogate_then_replace'))
def to_bytes ( obj , encoding = ' utf-8 ' , errors = None , nonstring = ' simplerepr ' ) :
""" Make sure that a string is a byte string
@ -56,22 +61,35 @@ def to_bytes(obj, encoding='utf-8', errors=None, nonstring='simplerepr'):
: kwarg errors : The error handler to use if the text string is not
encodable using the specified encoding . Any valid ` codecs error
handler < https : / / docs . python . org / 2 / library / codecs . html #codec-base-classes>`_
may be specified . There are two additional error strategies
specifically aimed at helping people to port code :
may be specified . There are three additional error strategies
specifically aimed at helping people to port code . The first two are :
: surrogate_or_strict : Will use surrogateescape if it is a valid
handler , otherwise it will use strict
: surrogate_or_replace : Will use surrogateescape if it is a valid
handler , otherwise it will use replace .
: surrogate_or_strict : Will use ` ` surrogateescape ` ` if it is a valid
handler , otherwise it will use ` ` strict ` `
: surrogate_or_replace : Will use ` ` surrogateescape ` ` if it is a valid
handler , otherwise it will use ` ` replace ` ` .
Because surrogateescape was added in Python3 this usually means that
Python3 will use surrogateescape and Python2 will use the fallback
error handler . Note that the code checks for surrogateescape when the
module is imported . If you have a backport of surrogateescape for
python2, be sure to register the error handler prior to importing this
Because ` ` surrogateescape ` ` was added in Python3 this usually means that
Python3 will use ` ` surrogateescape ` ` and Python2 will use the fallback
error handler . Note that the code checks for ` ` surrogateescape ` ` when the
module is imported . If you have a backport of ` ` surrogateescape ` ` for
Python2, be sure to register the error handler prior to importing this
module .
The default is ` surrogate_or_replace `
The last error handler is :
: surrogate_then_replace : Will use ` ` surrogateescape ` ` if it is a valid
handler . If encoding with ` ` surrogateescape ` ` would traceback ,
surrogates are first replaced with replacement characters
and then the string is encoded using ` ` replace ` ` ( which replaces
the rest of the nonencodable bytes ) . If ` ` surrogateescape ` ` is
not present it will simply use ` ` replace ` ` . ( Added in Ansible 2.3 )
This strategy is designed to never traceback when it attempts
to encode a string .
The default until Ansible - 2.2 was ` ` surrogate_or_replace ` `
From Ansible - 2.3 onwards , the default is ` ` surrogate_then_replace ` ` .
: kwarg nonstring : The strategy to use if a nonstring is specified in
` ` obj ` ` . Default is ' simplerepr ' . Valid values are :
@ -90,23 +108,36 @@ def to_bytes(obj, encoding='utf-8', errors=None, nonstring='simplerepr'):
byte string is in the specified encoding do : :
encoded_string = to_bytes ( to_text ( input_string , ' latin-1 ' ) , ' utf-8 ' )
. . versionchanged : : 2.3
Added the ` ` surrogate_then_replace ` ` error handler and made it the default error handler .
"""
if isinstance ( obj , binary_type ) :
return obj
if errors in ( None , ' surrogate_or_replace ' ) :
# We're given a text string
# If it has surrogates, we know because it will decode
original_errors = errors
if errors in _COMPOSED_ERROR_HANDLERS :
if HAS_SURROGATEESCAPE :
errors = ' surrogateescape '
else :
errors = ' replace '
elif errors == ' surrogate_or_strict ' :
if HAS_SURROGATEESCAPE :
errors = ' surrogateescape '
else :
errors = ' strict '
else :
errors = ' replace '
if isinstance ( obj , text_type ) :
try :
# Try this first as it's the fastest
return obj . encode ( encoding , errors )
except UnicodeEncodeError :
if original_errors in ( None , ' surrogate_then_replace ' ) :
# Slow but works
return_string = obj . encode ( ' utf-8 ' , ' surrogateescape ' )
return_string = return_string . decode ( ' utf-8 ' , ' replace ' )
return return_string . encode ( encoding , ' replace ' )
raise
# Note: We do these last even though we have to call to_bytes again on the
# value because we're optimizing the common case
@ -144,8 +175,27 @@ def to_text(obj, encoding='utf-8', errors=None, nonstring='simplerepr'):
: kwarg errors : The error handler to use if the byte string is not
decodable using the specified encoding . Any valid ` codecs error
handler < https : / / docs . python . org / 2 / library / codecs . html #codec-base-classes>`_
may be specified . On Python3 this defaults to ' surrogateescape ' . On
Python2 , this defaults to ' replace ' .
may be specified . We support three additional error strategies
specifically aimed at helping people to port code :
: surrogate_or_strict : Will use surrogateescape if it is a valid
handler , otherwise it will use strict
: surrogate_or_replace : Will use surrogateescape if it is a valid
handler , otherwise it will use replace .
: surrogate_then_replace : Does the same as surrogate_or_replace but
was added for symmetry with the error handlers in
: func : ` ansible . module_utils . _text . to_bytes ` ( Added in Ansible 2.3 )
Because surrogateescape was added in Python3 this usually means that
Python3 will use ` surrogateescape ` and Python2 will use the fallback
error handler . Note that the code checks for surrogateescape when the
module is imported . If you have a backport of ` surrogateescape ` for
python2 , be sure to register the error handler prior to importing this
module .
The default until Ansible - 2.2 was ` surrogate_or_replace `
In Ansible - 2.3 this defaults to ` surrogate_then_replace ` for symmetry
with : func : ` ansible . module_utils . _text . to_bytes ` .
: kwarg nonstring : The strategy to use if a nonstring is specified in
` ` obj ` ` . Default is ' simplerepr ' . Valid values are :
@ -158,22 +208,27 @@ def to_text(obj, encoding='utf-8', errors=None, nonstring='simplerepr'):
: returns : Typically this returns a text string . If a nonstring object is
passed in this may be a different type depending on the strategy
specified by nonstring . This will never return a byte string .
From Ansible - 2.3 onwards , the default is ` surrogate_then_replace ` .
. . versionchanged : : 2.3
Added the surrogate_then_replace error handler and made it the default error handler .
"""
if isinstance ( obj , text_type ) :
return obj
if errors in ( None , ' surrogate_or_replace ' ) :
if errors in _COMPOSED_ERROR_HANDLERS :
if HAS_SURROGATEESCAPE :
errors = ' surrogateescape '
else :
errors = ' replace '
elif errors == ' surrogate_or_strict ' :
if HAS_SURROGATEESCAPE :
errors = ' surrogateescape '
else :
errors = ' strict '
else :
errors = ' replace '
if isinstance ( obj , binary_type ) :
# Note: We don't need special handling for surrogate_then_replace
# because all bytes will either be made into surrogates or are valid
# to decode.
return obj . decode ( encoding , errors )
# Note: We do these last even though we have to call to_text again on the