<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:atom="http://www.w3.org/2005/Atom"><channel><title>Hacker News: textmode</title><link>https://news.ycombinator.com/user?id=textmode</link><description>Hacker News RSS</description><docs>https://hnrss.org/</docs><generator>hnrss v2.1.1</generator><lastBuildDate>Thu, 30 Apr 2026 20:32:03 +0000</lastBuildDate><atom:link href="https://hnrss.org/user?id=textmode" rel="self" type="application/rss+xml"></atom:link><item><title><![CDATA[New comment by textmode in "yy092: busybox omits csplit"]]></title><description><![CDATA[
<p>Correction:<p>sed -n '/pattern/=' file|yy092|sed -nf /dev/stdin file</p>
]]></description><pubDate>Thu, 27 Jun 2024 06:50:29 +0000</pubDate><link>https://news.ycombinator.com/item?id=40808020</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=40808020</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=40808020</guid></item><item><title><![CDATA[yy092: busybox omits csplit]]></title><description><![CDATA[
<p>NB. Try using busybox awk instead of yy092<p>usage:<p>sed -n '/pattern/=' file|yy092|sed -nf /dev/stdin file<p><pre><code>       /*
         context split using sed 
       */
   
    int fileno (FILE *);
    int x,y,z;
   %option nounput noinput noyywrap 
   %%
   ^[0-9]+$ {
    if(x){fprintf(stdout,"%d,%dwxx%02d\n",z,atoi(yytext)-1,y);x=0;y++;}
    if(!x){z=atoi(yytext);x++;}
    }
   .|\n
   %%
    int main(){yylex();exit(0);}</code></pre></p>
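<p>The NB above suggests busybox awk as a substitute for yy092. A sketch of an equivalent awk one-liner (the sample file and the START pattern are made up for the demonstration), pairing consecutive matching line numbers into the same sed write commands:</p>

```shell
# sketch of an awk substitute for yy092: turn a stream of matching line
# numbers into sed "write range to xxNN" commands, dropping the final
# section (as yy092 does); sample.txt and START are hypothetical
printf '%s\n' START a b START c d e START > sample.txt
sed -n '/START/=' sample.txt \
| awk 'x { printf "%d,%dwxx%02d\n", z, $1-1, y++ } { z = $1; x = 1 }' > cmds.sed
cat cmds.sed                      # 1,3wxx00 / 4,7wxx01
sed -nf cmds.sed sample.txt       # writes the sections to files xx00 and xx01
```

<p>As with yy092, everything after the last match is discarded, since a range is only emitted once the next matching line number arrives.</p>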
<hr>
<p>Comments URL: <a href="https://news.ycombinator.com/item?id=40806677">https://news.ycombinator.com/item?id=40806677</a></p>
<p>Points: 1</p>
<p># Comments: 1</p>
]]></description><pubDate>Thu, 27 Jun 2024 02:37:00 +0000</pubDate><link>https://news.ycombinator.com/item?id=40806677</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=40806677</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=40806677</guid></item><item><title><![CDATA[New comment by textmode in "Remove HTTP headers from gzip or zip on stdin yy054 (revised)"]]></title><description><![CDATA[
<p>Correction:<p><pre><code>      /* remove HTTP headers from multiple gzip or single zip from stdin */
    
     int fileno (FILE *);
     int setenv (const char *, const char *, int);
     #define jmp (yy_start) = 1 + 2 *
     int x;
    %option nounput noinput noyywrap
    %%
    HTTP\/[\40-\176]+\x0d\x0a x++;
    [\40-\176]+:[\40-\176]+\r\n if(!x)fwrite(yytext,1,yyleng,yyout);
    \x0D\x0A if(!x)fwrite(yytext,1,yyleng,yyout);x=0;
    %%
    int main()
    { 
    yylex();
    exit(0);
    }

</code></pre>
Usage example:<p>Retrieve hostnames, IP addresses and (if available) sitemap URLs from the latest Common Crawl.<p><pre><code>     ftp -4 https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-50/robotstxt.paths.gz # <-- 180K
     gzip -dc robotstxt.paths.gz \
     |head -5 \
     |sed 's>.*>GET /& HTTP/1.1[]Host: data.commoncrawl.org[]Connection: >;
           $!s/$/keep-alive[]/;$s/$/close[]/' \
     |tr '[]' '\r\n' \
     |openssl s_client -quiet -connect data.commoncrawl.org:443 \
     |yy054 \
     |zegrep -a '(^Sitemap:)|(^Host:)|(^WARC-Target-URI:)|(^WARC-IP-Address:)' > 1.txt
     exec cat 1.txt</code></pre></p>
]]></description><pubDate>Thu, 28 Dec 2023 21:37:30 +0000</pubDate><link>https://news.ycombinator.com/item?id=38798997</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=38798997</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=38798997</guid></item><item><title><![CDATA[New comment by textmode in "Remove HTTP headers from gzip or zip on stdin yy054 (revised)"]]></title><description><![CDATA[
<p>Usage example:<p>Download NetBSD 1.0 in a single TCP connection.<p><pre><code>    y="GET /pub/NetBSD-archive/NetBSD-1.0/source/src10/"
    z="Host: archive.netbsd.org"
    sed '$!s>.*>'"$y"'& HTTP/1.1[]'"$z"'[]Connection: keep-alive[]>;
         $s>.*>'"$y"'& HTTP/1.0[]'"$z"'[]>' << eof \
    |tr '[]' '\r\n' \
    |openssl s_client -quiet -connect 151.101.129.6:443 -servername archive.netbsd.org > http+gzip
    src10.aa
    src10.ab
    src10.ac
    src10.ad
    src10.ae
    src10.af
    src10.ag
    src10.ah
    src10.ai
    src10.aj
    src10.ak
    src10.al
    src10.am
    src10.an
    src10.ao
    src10.ap
    src10.aq
    src10.ar
    src10.as
    src10.at
    src10.au
    src10.av
    src10.aw
    src10.ax
    src10.ay
    src10.az
    src10.ba
    src10.bb
    src10.bc
    src10.bd
    src10.be
    src10.bf
    eof

    yy054 < http+gzip|tar tvzf /dev/stdin
</code></pre>
Alternate usage:<p>Passing any argv[1] prints the HTTP headers only<p><pre><code>    yy054 print < http+gzip
    yy054 x < http+gzip</code></pre></p>
]]></description><pubDate>Wed, 27 Dec 2023 01:09:36 +0000</pubDate><link>https://news.ycombinator.com/item?id=38777921</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=38777921</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=38777921</guid></item><item><title><![CDATA[Remove HTTP headers from gzip or zip on stdin yy054 (revised)]]></title><description><![CDATA[
<p>.<p><pre><code>      /* remove HTTP headers from multiple gzip or single zip from stdin */
    
     int fileno (FILE *);
     int setenv (const char *, const char *, int);
     #define jmp (yy_start) = 1 + 2 *
     int show_headers;
    %option nounput noinput noyywrap
    %%
    HTTP\/[01]\.[\15\12\40-\176]{0,1024}\r\n\r\n if(show_headers){fwrite(yytext,1,yyleng,yyout);putc(10,yyout);};
    .|\n if(!show_headers)fwrite(yytext,1,yyleng,yyout);
    %%
    int main(int argc,char *argv[])
    { 
    if(argc)if(argv[0])
    if(argv[1])show_headers++;
    yylex();
    exit(0);
    }</code></pre></p>
<hr>
<p>Comments URL: <a href="https://news.ycombinator.com/item?id=38777237">https://news.ycombinator.com/item?id=38777237</a></p>
<p>Points: 3</p>
<p># Comments: 2</p>
]]></description><pubDate>Tue, 26 Dec 2023 23:32:16 +0000</pubDate><link>https://news.ycombinator.com/item?id=38777237</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=38777237</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=38777237</guid></item><item><title><![CDATA[New comment by textmode in "Extract URLs Relative and/or Absolute yy044"]]></title><description><![CDATA[
<p>Normally I use yy030 but I have been experimenting with this instead.<p>Seems to be slightly faster and smaller than similar programs from html-xml-utils.<p><a href="https://www.w3.org/Tools/HTML-XML-utils/man1/" rel="nofollow noreferrer">https://www.w3.org/Tools/HTML-XML-utils/man1/</a><p>Compile:<p><pre><code>   links -no-connect -dump https://news.ycombinator.com/item?id=38727772 \
   |sed '1,4d;77,$d;s/[ ]\{6\}//' \
   |flex -8Cem;cc -O3 -std=c89 -W -Wall -pipe lex.yy.c -static -o yy044
   strip -s yy044
</code></pre>
Example usage:<p><pre><code>      # NB. not a real cookie
      curl -H "Cookie: user=santa&K7RGzmUtAoKv9OIRMfQ9bfwYpiDEuypp" -siA "" \
      https://news.ycombinator.com \
      |host=news.ycombinator.com/ yy044 r \
      |sed -n 's/&amp;/\&/g;/vote/p'</code></pre></p>
]]></description><pubDate>Thu, 21 Dec 2023 22:20:23 +0000</pubDate><link>https://news.ycombinator.com/item?id=38727891</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=38727891</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=38727891</guid></item><item><title><![CDATA[Extract URLs Relative and/or Absolute yy044]]></title><description><![CDATA[
<p><p><pre><code>       /*
            experimental
            usage: [host=example.com] yy044 [ar]
       */
    int fileno (FILE *);
    int x,y,absolute,relative,omit,proto;
    int setenv (const char *, const char *, int);
    int fprintf(FILE *__restrict, const char *__restrict, ...);
    size_t strlen (const char *);
   http https://|http://
   js javascript:
   %option nounput noinput noyywrap 
   %%
   href=[^{http}{js}\"'][^\"'\40>]+ puts("yy044");
   
   href=[\"'][^{http}{js}][^\"'\40>]+ {
    omit=0;proto=0;
    if(yytext[6]==46)omit++;
    if(yytext[7]==47)proto++;
    if(!omit)if(relative)
    {
    y=0;for(x=6;x<yyleng;x++)
    {
    if(yytext[x]){yytext[y]=yytext[x];y++;};
    }
    if(proto)fwrite("https:",1,6,yyout);
    if(!proto)if(getenv("host")||getenv("Host"))fwrite("https://",1,8,yyout);
    if(!proto)if(getenv("host")){fwrite(getenv("host"),1,strlen(getenv("host")),yyout);goto x;};
    if(!proto)if(getenv("Host")){fwrite(getenv("Host"),1,strlen(getenv("Host")),yyout);goto x;};
    x:
    fwrite(yytext,1,yyleng-6,yyout);
    putc(10,stdout);
    }
    }
            /*
              javascript:void(0)
              #blah
            */
   "https://"[^\40<>\43\42\47|\r\n]+ {
    if(absolute)
    if(yyleng>8)
    {
    y=0;for(x=0;x<yyleng;x++){if(yytext[x])yytext[y]=yytext[x];y++;};
    fwrite(yytext,1,yyleng,yyout);
    putc(10,stdout);
    }
    }
   "http://"[^\40<>\43\42\47|\r\n]+ {
    if(absolute)
    if(yyleng>7)
    {
    y=0;for(x=0;x<yyleng;x++){if(yytext[x])yytext[y]=yytext[x];y++;};
    fwrite(yytext,1,yyleng,yyout);
    putc(10,stdout);
    }
    }
         /*
             href=https://x/http://
         */
   .|\n
   %%
          int main(int argc,char* argv[])
          { 
          absolute++;relative++;
          if(argc)if(argv[1])
          {
          if(argv[1][0]==97)relative--;
          if(argv[1][0]==114)absolute--;
          }
          yylex();
          exit(0);
          }</code></pre></p>
<hr>
<p>Comments URL: <a href="https://news.ycombinator.com/item?id=38727772">https://news.ycombinator.com/item?id=38727772</a></p>
<p>Points: 1</p>
<p># Comments: 1</p>
]]></description><pubDate>Thu, 21 Dec 2023 22:11:11 +0000</pubDate><link>https://news.ycombinator.com/item?id=38727772</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=38727772</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=38727772</guid></item><item><title><![CDATA[Gzip and Zip File Extraction from stdin]]></title><description><![CDATA[
<p><p><pre><code>   /* remove HTTP headers from multiple gzip or single zip from stdin */

    int fileno(FILE *); 
    #define jmp (yy_start) = 1 + 2 *
    #define echo do{if(fwrite(yytext,(size_t)yyleng,1,yyout)){}}while(0)

   xa "\x1f\x8b"    
   /* GZIP magic number */
   xb "\x50\x4B\x03\x04"
   /* ZIP magic number */

   %s xa
   %option noyywrap noinput nounput
   %%
   {xa}|{xb} echo;jmp xa;
   <xa>"HTTP" jmp 0;
   <xa>.|\n echo;
   \n|.
   %%
   int main(){ yylex();exit(0);}</code></pre></p>
<hr>
<p>Comments URL: <a href="https://news.ycombinator.com/item?id=34243659">https://news.ycombinator.com/item?id=34243659</a></p>
<p>Points: 2</p>
<p># Comments: 0</p>
]]></description><pubDate>Wed, 04 Jan 2023 09:23:57 +0000</pubDate><link>https://news.ycombinator.com/item?id=34243659</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=34243659</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=34243659</guid></item><item><title><![CDATA[New comment by textmode in "Chunked-transfer decoding from stdin yy045"]]></title><description><![CDATA[
<p><p><pre><code>   /* chunked transfer decoding */
   
    #define echo do{if(fwrite(yytext,(size_t)yyleng,1,yyout)){}}while(0)
    #define jmp (yy_start) = 1 + 2 *
    int fileno (FILE *);
    int ischunked,chunksize,count;
   xa "\15"|"\12"
   xb "\15\12" 
   xc "HTTP/0.9"|"HTTP/1.0"|"HTTP/1.1"
   xd [Cc][Hh][Uu][Nn][Kk][Ee][Dd]
   xe [0-9a-fA-F]+\r\n
   xf [0-9a-fA-F]*\r\n
   %option noyywrap nounput noinput 
   %s xb xc xd xe xf
   %%
   ^{xc} echo;ischunked=0;jmp xc;
   <xc>^transfer-encoding: echo;jmp xb;
   <xb>\r\n\r\n echo;jmp xe;
   <xb>{xd} echo;ischunked=1;
   <xe>{xf}|{xe} {
   count=0;
   if(ischunked==1)
   {chunksize=strtol(yytext,NULL,16);
   jmp xd;};
   };
   <xd>{xb} jmp xf;
   <xd>. { 
   count++;
   if(count==chunksize)jmp xe;
   echo;
   };
   <xf>^[A-Fa-f0-9]+{xa}
   <xf>{xa}+[A-Fa-f0-9]+{xa}
   <xf>{xb}[A-Fa-f0-9]+{xb}
   %%
   int main(){ yylex();exit(0);}</code></pre></p>
]]></description><pubDate>Sun, 02 Oct 2022 10:15:28 +0000</pubDate><link>https://news.ycombinator.com/item?id=33054610</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=33054610</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=33054610</guid></item><item><title><![CDATA[Chunked-transfer decoding from stdin yy045]]></title><description><![CDATA[

<p>Comments URL: <a href="https://news.ycombinator.com/item?id=33054601">https://news.ycombinator.com/item?id=33054601</a></p>
<p>Points: 1</p>
<p># Comments: 1</p>
]]></description><pubDate>Sun, 02 Oct 2022 10:14:20 +0000</pubDate><link>https://news.ycombinator.com/item?id=33054601</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=33054601</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=33054601</guid></item><item><title><![CDATA[Not all grep implementations provide an “-o” option]]></title><description><![CDATA[
<p>For example, Plan9 grep.<p>Solution:<p><pre><code>       #!/bin/sh
       # usage: $0 pattern < file
       x=$(echo x|tr x '\34');
       tr -d '\12'|sed "s/$x//g;s/$1/$x&$x/g"|tr $x '\12'|sed -n "/^$1/p"</code></pre></p>
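<p>A worked inline example of the same technique (the input string and the pattern are made up), emulating grep -o with only tr and sed:</p>

```shell
# emulate `grep -o 'b.'` without -o: bracket each match with an unlikely
# delimiter (FS, octal 034), break on it, then keep only the match lines
x=$(printf '\034')
printf 'abc abd\n' \
| tr -d '\n' \
| sed "s/$x//g;s/b./$x&$x/g" \
| tr "$x" '\n' \
| sed -n '/^b./p'
```

<p>Deleting the delimiter first (s/$x//g) guards against it already appearing in the input, just as the script above does.</p>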
<hr>
<p>Comments URL: <a href="https://news.ycombinator.com/item?id=30186762">https://news.ycombinator.com/item?id=30186762</a></p>
<p>Points: 2</p>
<p># Comments: 1</p>
]]></description><pubDate>Thu, 03 Feb 2022 00:47:29 +0000</pubDate><link>https://news.ycombinator.com/item?id=30186762</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=30186762</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=30186762</guid></item><item><title><![CDATA[New comment by textmode in "A call to minimize distraction and respect users’ attention (2013)"]]></title><description><![CDATA[
<p>Below is a short script that downloads the image files and makes a PDF from them.  No browser required.<p>The script uses a feature of HTTP/1.1 called pipelining; proponents of HTTP/2 and HTTP/3 want people to believe it has problems because it does not fit their commercialised web business model.
As the script below demonstrates, it has no problems.
It is simply a feature that does not suit the online-ad-funded business model, with its gigantic corporate browsers, bloated conglomerations of web pages and incessant data collection.
Here, only 2 TCP connections are used to retrieve 141 images.
Most servers are less restrictive and allow more than 100 requests per TCP connection.
Pipelining works great, and is far more efficient than browsers that open hundreds of connections.
IMHO.<p><pre><code>    (export Connection=keep-alive
    x1=http://www.minimizedistraction.com/img/vrg_google_doc_final_vrs03-
    x2(){ seq -f "$x1%g.jpg" $1 $2;};
    x3(){ yy025|nc -vvn 173.236.175.199 80;};
    x2   1 100|x3;
    x2 101 200|x3;
    )|exec yy056|exec od -An -tx1 -vw99999|exec tr -d '\40'|exec sed 's/ffd9ffd8/ffd9\
    ffd8/g'|exec sed -n /ffd8/p|exec split -l1;
    for x in x??;do xxd -p -r < $x > $x.jpg;rm $x;done;
    convert x??.jpg 1.pdf 2>/dev/null;rm x??.jpg

    ls -l ./1.pdf
</code></pre>
More details on yy025 and yy056 here:
<a href="https://news.ycombinator.com/item?id=27769701" rel="nofollow">https://news.ycombinator.com/item?id=27769701</a></p>
]]></description><pubDate>Thu, 08 Jul 2021 09:27:42 +0000</pubDate><link>https://news.ycombinator.com/item?id=27769832</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=27769832</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=27769832</guid></item><item><title><![CDATA[HTTP/1.1 pipelining example: retrieving 100s of small files]]></title><description><![CDATA[
<p>A simple demonstration of HTTP/1.1 pipelining with netcat.<p>Retrieving 100s of small JPEG files with a single TCP connection. Here, 2 TCP connections for 141 files.<p>Splitting the output into image files and making a PDF from the images.<p>yy025 can be found here:<p>https://news.ycombinator.com/item?id=27178484<p>https://news.ycombinator.com/item?id=27180125<p>yy056 can be found here:<p>https://news.ycombinator.com/item?id=27769305<p><pre><code>    # std utils: split, seq, od, tr, sed, flex
    # common utils: nc, xxd, imagemagick
    # simple utils made with flex: yy025, yy056

    (export Connection=keep-alive
    x1=http://www.minimizedistraction.com/img/vrg_google_doc_final_vrs03-
    x2(){ seq -f "$x1%g.jpg" $1 $2;};
    x3(){ yy025|nc -vvn 173.236.175.199 80;};
    x2   1 100|x3;
    x2 101 200|x3;
    )|exec yy056|exec od -An -tx1 -vw99999|exec tr -d '\40'|exec sed 's/ffd9ffd8/ffd9\
    ffd8/g'|exec sed -n /ffd8/p|exec split -l1;
    for x in x??;do xxd -p -r < $x > $x.jpg;rm $x;done;
    convert x??.jpg 1.pdf 2>/dev/null;rm x??.jpg

    firefox ./1.pdf
</code></pre>
Note: The 82nd image is apparently corrupted, which you will notice on page 82 of the PDF.  However, this has no effect on the readability of the slide.</p>
<hr>
<p>Comments URL: <a href="https://news.ycombinator.com/item?id=27769701">https://news.ycombinator.com/item?id=27769701</a></p>
<p>Points: 2</p>
<p># Comments: 0</p>
]]></description><pubDate>Thu, 08 Jul 2021 09:06:37 +0000</pubDate><link>https://news.ycombinator.com/item?id=27769701</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=27769701</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=27769701</guid></item><item><title><![CDATA[JPEG File Extraction from stdin]]></title><description><![CDATA[
<p><p><pre><code>      cat > 056.l
 
        /* JPEG file carver */
    
     #define echo ECHO
     #define jmp BEGIN
     int fileno(FILE *);
    
    xa "\xFF\xD8"
    xb "\xFF\xD9" 
    
    %s xa 
    %option noyywrap nounput noinput
    %%
    
    {xa} echo;jmp xa;
    <xa>{xb} echo;jmp 0;
    <xa>.|\n|\r echo;
    .|\n
    
    %%
    int main(){ yylex();exit(0) ;}
    
    ^D

    flex 056.l
    cc -static -o yy056 lex.yy.c</code></pre></p>
<hr>
<p>Comments URL: <a href="https://news.ycombinator.com/item?id=27769305">https://news.ycombinator.com/item?id=27769305</a></p>
<p>Points: 1</p>
<p># Comments: 0</p>
]]></description><pubDate>Thu, 08 Jul 2021 07:49:08 +0000</pubDate><link>https://news.ycombinator.com/item?id=27769305</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=27769305</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=27769305</guid></item><item><title><![CDATA[New comment by textmode in "Althttpd: Simple webserver in a single C file"]]></title><description><![CDATA[
<p><a href="https://news.ycombinator.com/item?id=27490265" rel="nofollow">https://news.ycombinator.com/item?id=27490265</a> <-- yy054<p>The "gibberish" is GZIP compressed data.  "yy054" is a simple filter I wrote to extract a GZIP file from stdin, i.e., discard leading and trailing garbage.  As far as I can tell, the compressed file "ee.txt" is not chunked transfer encoded. If it were chunked, we would first extract the GZIP, then decompress and finally process the chunks (e.g., filter out the chunk sizes with the filter submitted in the OP).<p>In this case all we need to do is extract the GZIP file "ee.txt" from stdin, then decompress it:<p><pre><code>    printf "GET /ee.txt\r\nHost: stuff-storage.sfo3.digitaloceanspaces.com\r\nConnection: close\r\n\r\n"|openssl s_client -connect 138.68.34.161:443 -quiet|yy054|gzip -dc > 1.htm
    firefox ./1.htm
   </code></pre>
Hope this helps.  Apologies, I initially guessed wrong about the here doc; I was not sure what was meant by "gibberish".  Looks like the here doc is working fine.</p>
]]></description><pubDate>Sun, 13 Jun 2021 06:55:26 +0000</pubDate><link>https://news.ycombinator.com/item?id=27490395</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=27490395</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=27490395</guid></item><item><title><![CDATA[Gzip File Extraction from stdin]]></title><description><![CDATA[
<p>Note: only extracts the first GZIP file.  How to decompress multiple, consecutive GZIP files on stdin from the network (HTTP/1.1 pipelining):<p><pre><code>    cat > 054.l
     int fileno(FILE *);
     #define jmp BEGIN
     #define echo ECHO
    
    xa "\x1f\x8b"    
     /* GZIP magic number */
    
    %s xa
    %option noyywrap noinput nounput
    %%
    {xa} echo;jmp xa;
    <xa>.|\n echo;
    \n|.
    %%
    int main(){ yylex();exit(0);}
    ^D

    flex -8Crf 054.l
    cc -pipe -static lex.yy.c -o yy054

    example usage:
    printf 'GET /file.gz\r\nHost: example.com\r\nConnection: close\r\n\r\n'|nc -vv example.com 80|yy054|zcat</code></pre></p>
<hr>
<p>Comments URL: <a href="https://news.ycombinator.com/item?id=27490265">https://news.ycombinator.com/item?id=27490265</a></p>
<p>Points: 3</p>
<p># Comments: 1</p>
]]></description><pubDate>Sun, 13 Jun 2021 06:25:37 +0000</pubDate><link>https://news.ycombinator.com/item?id=27490265</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=27490265</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=27490265</guid></item><item><title><![CDATA[New comment by textmode in "Althttpd: Simple webserver in a single C file"]]></title><description><![CDATA[
<p>Need to get rid of the leading spaces on all lines except the "int fileno" line.  Can also forgo the "here doc" and just save the lines between "flex" and "eof" to a file.  Run flex on that file.  This will create lex.yy.c.  Then compile lex.yy.c.<p>The compiled program is only useful for filtering chunked transfer encoding on stdin.  Most "HTTP clients" like wget or curl already take care of processing chunked transfer encoding.  It is when working with something like netcat that chunked transfer encoding becomes "DIY".  This is a simple program that attempts to solve that problem.  It could be written by hand without using flex.</p>
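<p>For reference, a chunked body interleaves hexadecimal sizes with the data. A sketch of the framing, decoded here with awk purely for illustration (the flex filter in the sibling comments does the real work; the sample stream is made up):</p>

```shell
# chunked transfer encoding on the wire: <hex-size>\r\n<data>\r\n ... 0\r\n\r\n
# toy decoder: assumes no \r\n inside chunk data, and an awk (GNU awk,
# mawk, busybox awk) that accepts a multi-character RS
printf '5\r\nHello\r\n7\r\n, world\r\n0\r\n\r\n' \
| awk 'BEGIN { RS = "\r\n" } NR % 2 == 0 { printf "%s", $0 }'
```

<p>Splitting records on CRLF leaves the size lines at odd record numbers and the data at even ones; keeping only the even records reassembles the body.</p>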
]]></description><pubDate>Sat, 12 Jun 2021 10:40:43 +0000</pubDate><link>https://news.ycombinator.com/item?id=27483448</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=27483448</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=27483448</guid></item><item><title><![CDATA[New comment by textmode in "Althttpd: Simple webserver in a single C file"]]></title><description><![CDATA[
<p>The extra "a" is a typo but would have no effect.  The "i" is also superfluous but harmless.  Without more details on the "gibberish" it is difficult to guess what happened.  The space before "int fileno (FILE *);" is required. All the other lines must be left-justified, no leading spaces, except the line with "int main()" which can be indented if desired.</p>
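<p>To make these layout rules concrete, a sketch of a minimal, hypothetical pass-through filter with the required indentation (only the declaration line is indented; %option, the rules and the %% separators start in column one):</p>

```shell
# write a minimal pass-through filter; the one-space indent on the first
# line is required (indented lines are copied verbatim into lex.yy.c)
cat > min.l <<'eof'
 int fileno (FILE *);
%option noyywrap nounput noinput
%%
.|\n ECHO;
%%
int main(){ yylex();exit(0);}
eof
# if flex and cc are installed:  flex min.l && cc lex.yy.c -o yymin
cat min.l
```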
]]></description><pubDate>Wed, 09 Jun 2021 15:41:02 +0000</pubDate><link>https://news.ycombinator.com/item?id=27449208</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=27449208</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=27449208</guid></item><item><title><![CDATA[New comment by textmode in "Althttpd: Simple webserver in a single C file"]]></title><description><![CDATA[
<p>I make most HTTP requests using netcat or similar tcp clients so I write filters that read from stdin.  Reading text files with the chunk sizes in hex interspersed is generally easy.  Sometimes I do not even bother to remove the chunk sizes. Where it becomes an issue is when it breaks URLs.  Here is a simple chunked transfer decoder that reads from stdin and removes the chunk sizes.<p><pre><code>   flex -8iCrfa <<eof
    int fileno (FILE *);
   xa "\15"|"\12"
   xb "\15\12" 
   %option noyywrap nounput noinput 
   %%
   ^[A-Fa-f0-9]+{xa}
   {xa}+[A-Fa-f0-9]+{xa}
   {xb}[A-Fa-f0-9]+{xb} 
   %%
   int main(){ yylex();exit(0);}
   eof

   cc -std=c89 -Wall -pipe lex.yy.c -static -o yy045
</code></pre>
Example<p>Yahoo! serves chunked pages<p><pre><code>   printf 'GET / HTTP/1.1\r\nHost: us.yahoo.com\r\nConnection: close\r\n\r\n'|openssl s_client -connect us.yahoo.com:443 -ign_eof|./yy045</code></pre></p>
]]></description><pubDate>Tue, 08 Jun 2021 11:18:08 +0000</pubDate><link>https://news.ycombinator.com/item?id=27433595</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=27433595</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=27433595</guid></item><item><title><![CDATA[New comment by textmode in "Making HTTP clients for use with netcat-like programs, part 2 of 2"]]></title><description><![CDATA[
<p>Corrections:<p>/int main/{s/input();//;s/return 0/exit(0)/;};/int yywrap/s/return 0/exit(0)/;/%option/s/$/ noinput/</p>
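<p>The corrections above form a sed program; applied to a hypothetical two-line excerpt of the earlier source, the effect is:</p>

```shell
# apply the correction sed program to a made-up two-line excerpt
printf '%s\n' 'int main(){ input();return 0;}' '%option noyywrap nounput' \
| sed '/int main/{s/input();//;s/return 0/exit(0)/;};/int yywrap/s/return 0/exit(0)/;/%option/s/$/ noinput/'
# prints: int main(){ exit(0);}
#         %option noyywrap nounput noinput
```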
]]></description><pubDate>Sun, 30 May 2021 23:39:18 +0000</pubDate><link>https://news.ycombinator.com/item?id=27338117</link><dc:creator>textmode</dc:creator><comments>https://news.ycombinator.com/item?id=27338117</comments><guid isPermaLink="false">https://news.ycombinator.com/item?id=27338117</guid></item></channel></rss>