From b22b806fcc2fa1ce4ed76ca42070f35b1b94a049 Mon Sep 17 00:00:00 2001 From: Tony Cook Date: Tue, 25 Sep 2018 11:18:40 +1000 Subject: (perl #133170) fatalize sysread/syswrite/recv/send on :utf8 handles This includes removing the :utf8 logic from pp_syswrite. pp_sysread retains it, since it's also used for read(). Tests that are specifically testing the behaviour against :utf8 handles have been removed (eg in lib/open.t), several other tests that incidentally used those functions on :utf8 handles have been adapted to use :raw handles instead (eg. op/readline.t). Test lib/sigtrap.t fails if STDERR is :utf8, in code from the original 5.000 commit, which is intended to run in a signal handler --- cpan/autodie/t/recv.t | 3 ++ lib/open.t | 122 +------------------------------------------------- pod/perldiag.pod | 17 +++---- pod/perlfunc.pod | 33 ++++---------- pp_sys.c | 80 ++++++--------------------------- t/io/utf8.t | 14 +++--- t/lib/croak/pp_sys | 20 +++++++++ t/lib/warnings/pp_sys | 24 ---------- t/op/gmagic.t | 9 ---- t/op/readline.t | 10 ++--- t/op/sysio.t | 28 +----------- t/uni/overload.t | 6 +-- t/uni/readline.t | 3 +- 13 files changed, 70 insertions(+), 299 deletions(-) diff --git a/cpan/autodie/t/recv.t b/cpan/autodie/t/recv.t index f67b2f8187..97c7a4360d 100644 --- a/cpan/autodie/t/recv.t +++ b/cpan/autodie/t/recv.t @@ -13,6 +13,8 @@ $SIG{PIPE} = 'IGNORE'; my ($sock1, $sock2); socketpair($sock1, $sock2, AF_UNIX, SOCK_STREAM, PF_UNSPEC); +binmode $sock1; +binmode $sock2; my $buffer; send($sock1, "xyz", 0); @@ -40,6 +42,7 @@ SKIP: { eval { my $string = "now is the time..."; open(my $fh, '<', \$string) or die("Can't open \$string for read"); + binmode $fh; # $fh isn't a socket, so this should fail. recv($fh,$buffer,1,0); }; diff --git a/lib/open.t b/lib/open.t index 5150c7f8a2..fa17f1a97c 100644 --- a/lib/open.t +++ b/lib/open.t @@ -8,7 +8,7 @@ BEGIN { require './charset_tools.pl'; } -plan 23; +plan 11; # open::import expects 'open' as its first argument, but it clashes with open() sub import { @@ -62,126 +62,6 @@ is( ${^OPEN}, ":raw :crlf\0:raw :crlf", is( $^H{'open_IO'}, 'crlf', 'should record last layer set in %^H' ); SKIP: { - skip("no perlio, no :utf8", 12) unless (find PerlIO::Layer 'perlio'); - - eval <utf8"); - print O chr(0x100); - close O; - open(I, "), 0x100, ":utf8 single wide character round-trip"); - close I; -EOE - - open F, ">a"; - @a = map { chr(1 << ($_ << 2)) } 0..5; # 0x1, 0x10, .., 0x100000 - unshift @a, chr(0); # ... and a null byte in front just for fun - print F @a; - close F; - - sub systell { - use Fcntl 'SEEK_CUR'; - sysseek($_[0], 0, SEEK_CUR); - } - - require bytes; # not use - - my $ok; - - open F, "<:utf8", "a"; - $ok = $a = 0; - for (@a) { - unless ( - ($c = sysread(F, $b, 1)) == 1 && - length($b) == 1 && - ord($b) == ord($_) && - systell(F) == ($a += bytes::length($b)) - ) { - print '# ord($_) == ', ord($_), "\n"; - print '# ord($b) == ', ord($b), "\n"; - print '# length($b) == ', length($b), "\n"; - print '# bytes::length($b) == ', bytes::length($b), "\n"; - print '# systell(F) == ', systell(F), "\n"; - print '# $a == ', $a, "\n"; - print '# $c == ', $c, "\n"; - last; - } - $ok++; - } - close F; - ok($ok == @a, - "on :utf8 streams sysread() should work on characters, not bytes"); - - sub diagnostics { - print '# ord($_) == ', ord($_), "\n"; - print '# bytes::length($_) == ', bytes::length($_), "\n"; - print '# systell(G) == ', systell(G), "\n"; - print '# $a == ', $a, "\n"; - print '# $c == ', $c, "\n"; - } - - - my %actions = ( - syswrite => sub { syswrite G, shift; }, - 'syswrite len' => sub { syswrite G, shift, 1; }, - 'syswrite len pad' => sub { - my $temp = shift() . "\243"; - syswrite G, $temp, 1; }, - 'syswrite off' => sub { - my $temp = "\351" . shift(); - syswrite G, $temp, 1, 1; }, - 'syswrite off pad' => sub { - my $temp = "\351" . shift() . "\243"; - syswrite G, $temp, 1, 1; }, - ); - - foreach my $key (sort keys %actions) { - # syswrite() on should work on characters, not bytes - open G, ">:utf8", "b"; - - print "# $key\n"; - $ok = $a = 0; - for (@a) { - unless ( - ($c = $actions{$key}($_)) == 1 && - systell(G) == ($a += bytes::length($_)) - ) { - diagnostics(); - last; - } - $ok++; - } - close G; - ok($ok == @a, - "on :utf8 streams syswrite() should work on characters, not bytes"); - - open G, "<:utf8", "b"; - $ok = $a = 0; - for (@a) { - unless ( - ($c = sysread(G, $b, 1)) == 1 && - length($b) == 1 && - ord($b) == ord($_) && - systell(G) == ($a += bytes::length($_)) - ) { - print '# ord($_) == ', ord($_), "\n"; - print '# ord($b) == ', ord($b), "\n"; - print '# length($b) == ', length($b), "\n"; - print '# bytes::length($b) == ', bytes::length($b), "\n"; - print '# systell(G) == ', systell(G), "\n"; - print '# $a == ', $a, "\n"; - print '# $c == ', $c, "\n"; - last; - } - $ok++; - } - close G; - ok($ok == @a, - "checking syswrite() output on :utf8 streams by reading it back in"); - } -} -SKIP: { skip("no perlio", 1) unless (find PerlIO::Layer 'perlio'); skip("no Encode", 1) unless $Config{extensions} =~ m{\bEncode\b}; skip("EBCDIC platform doesnt have 'use encoding' used by open ':locale'", 1) diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 2c1fe74a87..657a427b1d 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -3205,27 +3205,24 @@ neither as a system call nor an ioctl call (SIOCATMARK). Perl. The current valid ones are given in L. -=item %s() is deprecated on :utf8 handles. This will be a fatal error in Perl 5.30 +=item %s() isn't allowed on :utf8 handles -(D deprecated) The sysread(), recv(), syswrite() and send() operators are -deprecated on handles that have the C<:utf8> layer, either explicitly, or +(F) The sysread(), recv(), syswrite() and send() operators are +not allowed on handles that have the C<:utf8> layer, either explicitly, or implicitly, eg., with the C<:encoding(UTF-16LE)> layer. -Both sysread() and recv() currently use only the C<:utf8> flag for the stream, -ignoring the actual layers. Since sysread() and recv() do no UTF-8 +Previously sysread() and recv() currently use only the C<:utf8> flag for the stream, +ignoring the actual layers. Since sysread() and recv() did no UTF-8 validation they can end up creating invalidly encoded scalars. -Similarly, syswrite() and send() use only the C<:utf8> flag, otherwise ignoring -any layers. If the flag is set, both write the value UTF-8 encoded, even if +Similarly, syswrite() and send() used only the C<:utf8> flag, otherwise ignoring +any layers. If the flag is set, both wrote the value UTF-8 encoded, even if the layer is some different encoding, such as the example above. Ideally, all of these operators would completely ignore the C<:utf8> state, working only with bytes, but this would result in silently breaking existing code. -In Perl 5.30, it will no longer be possible to use sysread(), recv(), -syswrite() or send() to read or send bytes from/to :utf8 handles. - =item "%s" is more clearly written simply as "%s" in regex; marked by S<<-- HERE> in m/%s/ (W regexp) (only under C> or within C<(?[...])>) diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index a2fad3b8fc..316daff1cf 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -6284,14 +6284,9 @@ string otherwise. If there's an error, returns the undefined value. This call is actually implemented in terms of the L system call. See L for examples. -Note the I: depending on the status of the socket, either -(8-bit) bytes or characters are received. By default all sockets -operate on bytes, but for example if the socket has been changed using -L|/binmode FILEHANDLE, LAYER> to operate with the -C<:encoding(UTF-8)> I/O layer (see the L pragma), the I/O will -operate on UTF8-encoded Unicode -characters, not bytes. Similarly for the C<:encoding> layer: in that -case pretty much any characters can be read. +Note that if the socket has been marked as C<:utf8>, C will +throw an exception. The C<:encoding(...)> layer implicitly introduces +the C<:utf8> layer. See L|/binmode FILEHANDLE, LAYER>. =item redo LABEL X @@ -7083,14 +7078,9 @@ case it does a L syscall. Returns the number of characters sent, or the undefined value on error. The L syscall is currently unimplemented. See L for examples. -Note the I: depending on the status of the socket, either -(8-bit) bytes or characters are sent. By default all sockets operate -on bytes, but for example if the socket has been changed using -L|/binmode FILEHANDLE, LAYER> to operate with the -C<:encoding(UTF-8)> I/O layer (see L|/open FILEHANDLE,EXPR>, or -the L pragma), the I/O will operate on UTF-8 -encoded Unicode characters, not bytes. Similarly for the C<:encoding> -layer: in that case pretty much any characters can be sent. +Note that if the socket has been marked as C<:utf8>, C will +throw an exception. The C<:encoding(...)> layer implicitly introduces +the C<:utf8> layer. See L|/binmode FILEHANDLE, LAYER>. =item setpgrp PID,PGRP X X @@ -8723,10 +8713,8 @@ L|/eof FILEHANDLE> doesn't work well on device files (like ttys) anyway. Use L|/sysread FILEHANDLE,SCALAR,LENGTH,OFFSET> and check for a return value of 0 to decide whether you're done. -Note that if the filehandle has been marked as C<:utf8>, Unicode -characters are read instead of bytes (the LENGTH, OFFSET, and the -return value of L|/sysread FILEHANDLE,SCALAR,LENGTH,OFFSET> -are in Unicode characters). The C<:encoding(...)> layer implicitly +Note that if the filehandle has been marked as C<:utf8>, C will +throw an exception. The C<:encoding(...)> layer implicitly introduces the C<:utf8> layer. See L|/binmode FILEHANDLE, LAYER>, L|/open FILEHANDLE,EXPR>, and the L pragma. @@ -8887,10 +8875,7 @@ string other than the beginning. A negative OFFSET specifies writing that many characters counting backwards from the end of the string. If SCALAR is of length zero, you can only use an OFFSET of 0. -B: If the filehandle is marked C<:utf8>, Unicode characters -encoded in UTF-8 are written instead of bytes, and the LENGTH, OFFSET, and -return value of L|/syswrite FILEHANDLE,SCALAR,LENGTH,OFFSET> -are in (UTF8-encoded Unicode) characters. +B: If the filehandle is marked C<:utf8>, C will raise an exception. The C<:encoding(...)> layer implicitly introduces the C<:utf8> layer. Alternately, if the handle is not marked with an encoding but you attempt to write characters with code points over 255, raises an exception. diff --git a/pp_sys.c b/pp_sys.c index 4ae475d460..00faa7711f 100644 --- a/pp_sys.c +++ b/pp_sys.c @@ -1725,10 +1725,9 @@ PP(pp_sysread) if ((fp_utf8 = PerlIO_isutf8(IoIFP(io))) && !IN_BYTES) { if (PL_op->op_type == OP_SYSREAD || PL_op->op_type == OP_RECV) { - Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), - "%s() is deprecated on :utf8 handles. " - "This will be a fatal error in Perl 5.30", - OP_DESC(PL_op)); + Perl_croak(aTHX_ + "%s() isn't allowed on :utf8 handles", + OP_DESC(PL_op)); } buffer = SvPVutf8_force(bufsv, blen); /* UTF-8 may not have been set if they are all low bytes */ @@ -1939,7 +1938,6 @@ PP(pp_syswrite) const char *buffer; SSize_t retval; STRLEN blen; - STRLEN orig_blen_bytes; const int op_type = PL_op->op_type; bool doing_utf8; U8 *tmpbuf = NULL; @@ -1985,20 +1983,12 @@ PP(pp_syswrite) /* Do this first to trigger any overloading. */ buffer = SvPV_const(bufsv, blen); - orig_blen_bytes = blen; doing_utf8 = DO_UTF8(bufsv); if (PerlIO_isutf8(IoIFP(io))) { - Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), - "%s() is deprecated on :utf8 handles. " - "This will be a fatal error in Perl 5.30", - OP_DESC(PL_op)); - if (!SvUTF8(bufsv)) { - /* We don't modify the original scalar. */ - tmpbuf = bytes_to_utf8((const U8*) buffer, &blen); - buffer = (char *) tmpbuf; - doing_utf8 = TRUE; - } + Perl_croak(aTHX_ + "%s() isn't allowed on :utf8 handles", + OP_DESC(PL_op)); } else if (doing_utf8) { STRLEN tmplen = blen; @@ -2031,25 +2021,10 @@ PP(pp_syswrite) #endif { Size_t length = 0; /* This length is in characters. */ - STRLEN blen_chars; IV offset; - if (doing_utf8) { - if (tmpbuf) { - /* The SV is bytes, and we've had to upgrade it. */ - blen_chars = orig_blen_bytes; - } else { - /* The SV really is UTF-8. */ - /* Don't call sv_len_utf8 on a magical or overloaded - scalar, as we might get back a different result. */ - blen_chars = sv_or_pv_len_utf8(bufsv, buffer, blen); - } - } else { - blen_chars = blen; - } - if (MARK >= SP) { - length = blen_chars; + length = blen; } else { #if Size_t_size > IVSIZE length = (Size_t)SvNVx(*++MARK); @@ -2065,46 +2040,21 @@ PP(pp_syswrite) if (MARK < SP) { offset = SvIVx(*++MARK); if (offset < 0) { - if (-offset > (IV)blen_chars) { + if (-offset > (IV)blen) { Safefree(tmpbuf); DIE(aTHX_ "Offset outside string"); } - offset += blen_chars; - } else if (offset > (IV)blen_chars) { + offset += blen; + } else if (offset > (IV)blen) { Safefree(tmpbuf); DIE(aTHX_ "Offset outside string"); } } else offset = 0; - if (length > blen_chars - offset) - length = blen_chars - offset; - if (doing_utf8) { - /* Here we convert length from characters to bytes. */ - if (tmpbuf || SvGMAGICAL(bufsv) || SvAMAGIC(bufsv)) { - /* Either we had to convert the SV, or the SV is magical, or - the SV has overloading, in which case we can't or mustn't - or mustn't call it again. */ - - buffer = (const char*)utf8_hop((const U8 *)buffer, offset); - length = utf8_hop((U8 *)buffer, length) - (U8 *)buffer; - } else { - /* It's a real UTF-8 SV, and it's not going to change under - us. Take advantage of any cache. */ - I32 start = offset; - I32 len_I32 = length; - - /* Convert the start and end character positions to bytes. - Remember that the second argument to sv_pos_u2b is relative - to the first. */ - sv_pos_u2b(bufsv, &start, &len_I32); - - buffer += start; - length = len_I32; - } - } - else { - buffer = buffer+offset; - } + if (length > blen - offset) + length = blen - offset; + buffer = buffer+offset; + #ifdef PERL_SOCK_SYSWRITE_IS_SEND if (IoTYPE(io) == IoTYPE_SOCKET) { retval = PerlSock_send(fd, buffer, length, 0); @@ -2120,8 +2070,6 @@ PP(pp_syswrite) if (retval < 0) goto say_undef; SP = ORIGMARK; - if (doing_utf8) - retval = utf8_length((U8*)buffer, (U8*)buffer + retval); Safefree(tmpbuf); #if Size_t_size > IVSIZE diff --git a/t/io/utf8.t b/t/io/utf8.t index 2b700595c8..0bc8a5c2bf 100644 --- a/t/io/utf8.t +++ b/t/io/utf8.t @@ -10,7 +10,7 @@ skip_all_without_perlio(); no utf8; # needed for use utf8 not griping about the raw octets -plan(tests => 63); +plan(tests => 62); $| = 1; @@ -312,16 +312,14 @@ is($failed, undef); { # [perl #23428] Somethings rotten in unicode semantics open F, ">$a_file"; - binmode F, ":utf8"; - no warnings qw(deprecated); - syswrite(F, $a = chr(0x100)); + binmode F; + $a = "A"; + utf8::upgrade($a); + syswrite(F, $a); close F; - is( ord($a), 0x100, '23428 syswrite should not downgrade scalar' ); - like( $a, qr/^\w+/, '23428 syswrite should not downgrade scalar' ); + ok(utf8::is_utf8($a), '23428 syswrite should not downgrade scalar' ); } -# sysread() and syswrite() tested in lib/open.t since Fcntl is used - { # on a :utf8 stream should complain immediately with -w # if it finds bad UTF-8 (:encoding(utf8) works this way) diff --git a/t/lib/croak/pp_sys b/t/lib/croak/pp_sys index 8b7dc9d53d..be100da27a 100644 --- a/t/lib/croak/pp_sys +++ b/t/lib/croak/pp_sys @@ -73,3 +73,23 @@ open my $���������, "../harness"; opendir $���������, "."; EXPECT Cannot open $��������� as a dirhandle: it is already open as a filehandle at - line 5. +######## +# NAME sysread() disallowed on :utf8 +open my $fh, "<:raw", "../harness" or die "# $!"; +my $buf; +sysread $fh, $buf, 10; +binmode $fh, ':utf8'; +sysread $fh, $buf, 10; +EXPECT +sysread() isn't allowed on :utf8 handles at - line 5. +######## +# NAME syswrite() disallowed on :utf8 +my $file = "syswwarn.tmp"; +open my $fh, ">:raw", $file or die "# $!"; +syswrite $fh, 'ABC'; +binmode $fh, ':utf8'; +syswrite $fh, 'ABC'; +close $fh; +END { unlink $file; } +EXPECT +syswrite() isn't allowed on :utf8 handles at - line 5. diff --git a/t/lib/warnings/pp_sys b/t/lib/warnings/pp_sys index 90d3cc790d..5f6b83d2f6 100644 --- a/t/lib/warnings/pp_sys +++ b/t/lib/warnings/pp_sys @@ -890,30 +890,6 @@ sleep(-1); EXPECT sleep() with negative argument at - line 2. ######## -# NAME sysread() deprecated on :utf8 -open my $fh, "<:raw", "../harness" or die "# $!"; -my $buf; -sysread $fh, $buf, 10; -binmode $fh, ':utf8'; -sysread $fh, $buf, 10; -no warnings 'deprecated'; -sysread $fh, $buf, 10; -EXPECT -sysread() is deprecated on :utf8 handles. This will be a fatal error in Perl 5.30 at - line 5. -######## -# NAME syswrite() deprecated on :utf8 -my $file = "syswwarn.tmp"; -open my $fh, ">:raw", $file or die "# $!"; -syswrite $fh, 'ABC'; -binmode $fh, ':utf8'; -syswrite $fh, 'ABC'; -no warnings 'deprecated'; -syswrite $fh, 'ABC'; -close $fh; -unlink $file; -EXPECT -syswrite() is deprecated on :utf8 handles. This will be a fatal error in Perl 5.30 at - line 5. -######## # NAME stat on name with \0 use warnings; my @x = stat("./\0-"); diff --git a/t/op/gmagic.t b/t/op/gmagic.t index 210e8e5cc9..0ed575525f 100644 --- a/t/op/gmagic.t +++ b/t/op/gmagic.t @@ -76,15 +76,6 @@ expected_tie_calls(tied $c, 1, 2, 'chomping a ref'); expected_tie_calls(tied $c, 2, 2, 'calling sysread with tied buf'); close $h or die "$0 cannot close $outfile: $!"; - # Do this again, with a utf8 handle - $c = *foo; # 1 write - open $h, "<:utf8", $outfile; - no warnings 'deprecated'; - sysread $h, $c, 3, 7; # 1 read; 1 write - is $c, "*main::bar", 'what sysread wrote'; # 1 read - expected_tie_calls(tied $c, 2, 2, 'calling sysread with tied buf'); - close $h or die "$0 cannot close $outfile: $!"; - unlink_all $outfile; } diff --git a/t/op/readline.t b/t/op/readline.t index c2727fe829..ba4efa71a4 100644 --- a/t/op/readline.t +++ b/t/op/readline.t @@ -215,9 +215,8 @@ SKIP: { my $line = 'ascii'; my ( $in, $out ); pipe $in, $out; - binmode $out, ':utf8'; + binmode $out; binmode $in, ':utf8'; - no warnings qw(deprecated); syswrite $out, "...\n"; $line .= readline $in; @@ -228,10 +227,11 @@ SKIP: { my $line = "\x{2080} utf8";; my ( $in, $out ); pipe $in, $out; - binmode $out, ':utf8'; + binmode $out; binmode $in, ':utf8'; - no warnings qw(deprecated); - syswrite $out, "\x{2080}...\n"; + my $outdata = "\x{2080}...\n"; + utf8::encode($outdata); + syswrite $out, $outdata; $line .= readline $in; is( $line, "\x{2080} utf8\x{2080}...\n", 'appending from utf to utf8' ); diff --git a/t/op/sysio.t b/t/op/sysio.t index ebcf821d37..c6d9bd8917 100644 --- a/t/op/sysio.t +++ b/t/op/sysio.t @@ -6,7 +6,7 @@ BEGIN { set_up_inc('../lib'); } -plan tests => 48; +plan tests => 45; open(I, 'op/sysio.t') || die "sysio.t: cannot find myself: $!"; binmode I; @@ -221,32 +221,6 @@ close(I); unlink_all $outfile; -# Check that utf8 IO doesn't upgrade the scalar -{ - no warnings 'deprecated'; - open(I, ">$outfile") || die "sysio.t: cannot write $outfile: $!"; - # Will skip harmlessly on stdioperl - eval {binmode STDOUT, ":utf8"}; - die $@ if $@ and $@ !~ /^IO layers \(like ':utf8'\) unavailable/; - - # y diaresis is \w when UTF8 - $a = chr 255; - - unlike($a, qr/\w/); - - syswrite I, $a; - - # Should not be upgraded as a side effect of syswrite. - unlike($a, qr/\w/); - - # This should work - eval {syswrite I, 2;}; - is($@, ''); - - close(I); -} -unlink_all $outfile; - chdir('..'); 1; diff --git a/t/uni/overload.t b/t/uni/overload.t index 8e722c850e..161484500e 100644 --- a/t/uni/overload.t +++ b/t/uni/overload.t @@ -9,7 +9,7 @@ BEGIN { set_up_inc( '../lib' ); } -plan(tests => 217); +plan(tests => 193); package UTF8Toggle; use strict; @@ -158,8 +158,8 @@ my $tmpfile = tempfile(); foreach my $operator ('print', 'syswrite', 'syswrite len', 'syswrite off', 'syswrite len off') { - foreach my $layer ('', ':utf8') { - open my $fh, "+>$layer", $tmpfile or die $!; + foreach my $layer ('', $operator =~ /syswrite/ ? () : (':utf8')) { + open my $fh, "+>:raw$layer", $tmpfile or die $!; my $pad = $operator =~ /\boff\b/ ? "\243" : ""; my $trail = $operator =~ /\blen\b/ ? "!" : ""; my $u = UTF8Toggle->new("$pad$E_acute\n$trail"); diff --git a/t/uni/readline.t b/t/uni/readline.t index 893a290893..253efe3a42 100644 --- a/t/uni/readline.t +++ b/t/uni/readline.t @@ -29,8 +29,7 @@ like($@, qr/Modification of a read-only value attempted/, '[perl #19566]'); use strict; my $err; { - no warnings qw(deprecated); - open ���, '.' and sysread ���, $_, 1; + open ���, '.' and binmode ��� and sysread ���, $_, 1; $err = $! + 0; close ���; } -- 2.11.0