#!/bin/sh

# unob is a de-obfuscating layout script for C code.
# Prior to tac, scripting the layout of C code was intractable.
#
# There are uncountably many possible refinements to this script.
# Enjoy the madness of getting a perfect layout of C code via
# an automated process.  Or just enjoy a significant help in
# reading obfuscated C code!

# Environment overrides; each keeps its value when already set and non-empty.
#   PROG  intended tokenizer program (default tac):  PROG=tac ./unob.sh prog.c
#   KW    file the awk program reads its keyword list from (default c11)
#   DEF   extra compiler flags used when preprocessing with -p, e.g.
#         DEF="-DW=\\\"kandr2\\\" -DU=\\\"usage\ string\\\"" ./unob.sh -p prog.c
# The expansions are quoted so that values containing blanks or glob
# characters are not split or expanded by the ":" command.
: "${PROG:=tac}"
: "${KW:=c11}"
: "${DEF:=}"

# Turn a non-empty DEBUG value into the awk option that sets the awk
# variable DEBUG.  The quoted -n test works for any value; an unquoted
# "test -z" fails with a syntax error when the value contains blanks.
if [ -n "$DEBUG" ]; then
	DEBUG="-v DEBUG=$DEBUG"
fi

# optionally preprocess up to 3 times to handle #ifdef X #define Y
# unob prog.c			# nonce
# unob -p prog.c		# once
# unob -p -p prog.c		# twice
# unob -p -p -p prog.c		# thrice
# unob -p -p -p -p prog.c	# yer out

# Allow changing the width at which output is folded (default 100).
# The option must be the first argument, written either as "-w N" or "-wN";
# it is removed from the positional parameters once consumed.
# Only an argument that really starts with -w is treated as the option, so a
# file name that merely contains "x-w" is left alone.
WIDTH=100
case "$1" in
-w)
	# separate value: it must be present as the second argument
	if [ "$#" -lt 2 ]; then
		printf '%s: option -w requires a width\n' "${0##*/}" >&2
		exit 2
	fi
	WIDTH=$2
	shift 2
	;;
-w*)
	# attached value: strip the leading "-w"
	WIDTH=${1#-w}
	shift
	;;
esac

# WIDTH is pasted into the awk program text later on, so accept digits only.
case "$WIDTH" in
'' | *[!0-9]*)
	printf '%s: invalid width: %s\n' "${0##*/}" "$WIDTH" >&2
	exit 2
	;;
esac

# echo WIDTH=$WIDTH
# echo $*
# exit

# pp [-p [-p [-p]]] [file...]
# Write the concatenated files (stdin when none are given) to stdout.
# Each leading -p, up to three, adds one pass through the C preprocessor:
#   - the first pass runs "cc -trigraphs -E" with the flags held in DEF;
#   - a second and third -p add plain "cc -E -" passes.
# #include lines are renamed to @include before preprocessing so that cc
# does not expand them, and renamed back afterwards.  Lines that start with
# "#" followed by a character other than "#" are blanked after preprocessing.
# Globals: reads DEF; sets cpp2 and cpp3.
# Returns: the exit status of the last command of the pipeline (or of cat).
pp() {
	cpp2=cat
	cpp3=cat

	# No -p: plain concatenation.  "$@" keeps file names with blanks intact.
	if [ "$1" != "-p" ]; then
		cat "$@"
		return
	fi
	shift

	if [ "$1" = "-p" ]; then
		cpp2="cc -E -"
		shift
	fi
	if [ "$1" = "-p" ]; then
		cpp3="cc -E -"
		shift
	fi

	# NOTE: eval re-parses DEF so that the escaped quotes shown in the usage
	# example reach cc as intended; DEF is therefore executed as shell text
	# and must come from a trusted source.
	cat "$@" |
		sed 's/# *include/@include/' |
		eval cc -trigraphs -E "$DEF" - |
		eval "$cpp2" |
		eval "$cpp3" |
		sed 's/^#[^#].*$//' |
		sed 's/@inc/#inc/'
}

# awk program that lays out the token stream.  Its rules compare each input
# record against single tokens such as "typedef", "for", "{" and ";".
# The BEGIN block below:
#   - receives the shell values of WIDTH and KW, which are pasted into the
#     program text at the two places where the single quotes are interrupted;
#   - replaces a width below 50 by 80 and a width above 200 by 200;
#   - initialises last and nl to a newline;
#   - reads the file named by KW, one entry per line, into kw[], and stops
#     after printing a message on stderr when nothing could be read;
#   - seeds typ[] with the built-in type names.
script='
BEGIN {
	WIDTH="'$WIDTH'"
	if (WIDTH+0 < 50)       WIDTH=80
	else if (WIDTH+0 > 200) WIDTH=200

	last=nl="\n"
	f="'$KW'"; while(getline <f > 0) ++kw[$0]; close(f);
	if (length(kw) == 0) {
		print "cannot find",f "; check KW environment variable" | "cat 1>&2"
		exit
	}

	++typ["FILE"]
	++typ["char"]
	++typ["short"]
	++typ["int"]
	++typ["long"]
	++typ["unsigned"]
	++typ["float"]
	++typ["double"]
	++typ["void"]
}

# istyp(a): true when a is a key of typ[], i.e. a known type name.
function istyp(a)  { return a in typ }
# iskw(a): true when a is a key of kw[]; the token * never counts as a keyword.
function iskw(a)   { return a != "*" && a in kw }
# indent(): blanks for the current nesting level n, three per level, limited
# to the length of the blank string literal.
function indent()  { return sprintf("%.*s", n*3, "                              ") }
# newline(): print a line break and remember it in last, except while infor
# or incomp is set or when last already is a line break.
function newline() { if (!(infor||incomp) && last != nl) printf last=nl }
# show(a): print token a, preceded by the separator chosen by space(), and
# fold the output line when it has grown too long.
# Globals used: len (characters printed on the current line), last and prev
# (the two most recently shown tokens), paren, WIDTH; WID is set here.
function show(a) {
	# soft limit, 20 columns short of the hard limit WIDTH
	WID = WIDTH - 20
	# fold when past the hard limit, or past the soft limit directly after
	# a token starting with a semicolon, or after a comma while paren is 0
	if (len>WIDTH || (len>WID && (last ~ /^;/ || (last ~ /^,/ && paren==0))))
		printf "%c%s", last=nl, indent()	# yes, double indent here
	# past the soft limit with paren at 0: break after a closing parenthesis
	# unless a itself starts with a closing parenthesis, semicolon or comma
	if (len>WID && last ~ /^\)/ && a !~ /^[\);,]/ && paren == 0)
		printf last=nl
	# at the start of a line: reset the column count and print the indent
	if (last==nl) { len=0; printf "%s", indent() }

	# A token longer than WIDTH that contains a double quote is printed in
	# pieces of up to WIDTH characters; every piece but the final one is
	# followed by a backslash, and each piece ends with a line break and
	# the indent.  The first substr() starts at offset 0, so the first
	# piece is one character shorter than the others.
	if (length(a) > WIDTH && a ~ /\"/) {
		newline()
		len = WIDTH
		max = length(a)
		for (off=0; off <= max; off += len)
			printf "%s%s%c%s",
				substr(a, off, len),
				(off+len > max) ? "" : "\\",
				last=nl, indent()
		len = 0
		return
	}

	# normal case: separator plus token, then advance the token history
	x = space() a
	printf "%s", x
	len += length(x)
	prev=last
	last=a
}

# why(a): the separator used by space() when a blank is wanted.
# Returns a single blank; when DEBUG is greater than 0 the rule tag a is
# embedded between two blanks so the output shows which rule fired.
function why(a) {
	if (DEBUG > 0) return " " a " "
	return " "
}

# XXX create combined tokens (not standard C):
# ::, -~, ~-
# careful, sometimes - ~ is correct: x[-~3] vs x - ~3

# space(): decide what goes between the previously shown token and the
# current one.  Returns either the empty string or the result of why().
# It inspects the current record $0 (not the argument given to show) together
# with last and prev.  The rules are tried in order; the first one that
# returns wins, so their order matters.
function space() {
	# snug up ; against non-{ ; }
	# NOTE(review): the pattern below matches the literal three-character
	# sequence semicolon, open brace, close brace rather than any one of
	# them; confirm whether a bracket expression was intended.
	if ($0 == ";" && last !~ /;\{\}/)
		return ""
	# bitwise-and or address-of
	# & after something that looks like an operand is spaced as a binary
	# operator; any other & followed by a name or number gets no blank
	if (prev ~ /[A-Za-z0-9_\]\)]/ && last == "&" && $0 ~ /[A-Za-z0-9_]/)
		return why("X1X")
	else if (last == "&" && $0 ~ /[A-Za-z0-9_]/)
		return ""
	# indirection or multiply
	# * after a type name gets no blank, * after another operand gets one
	if (last == "*" && prev ~ /[A-Za-z0-9_\)\]]/)
		if (istyp(prev)) return ""
		else             return why("X1.5X");
	# function argument prototypes: A(F), casts (int)F {
	if ((istyp(last) && $0 == ")") || (last == ")" && $0 == "("))
		return "";
	# spaces are always preferred here
	# (after a keyword and after a comma)
	if (iskw(last) || last == ",")
		return why("X2X")
	# try to fix -~n and ~-n when not expressions
	# a minus that follows an operator character is kept tight against the
	# operand or the ~ that comes next
	if (prev ~ /[<>?:\~\+\-\*\/%=\[&|^]/) {
		if (last == "-" && $0 ~ /[A-Za-z0-9_]/)
			return ""
		if (last == "-" && $0 == "~")
			return ""
	}
	# fix i.d and 1.23
	if (last == "." && $0 ~ /[A-Za-z0-9_]/)
		return ""
	# no blank before a closing parenthesis that follows a bracket or parenthesis
	if (last ~ /[)\]\[]/ && $0 == ")")
		return ""
	# blank after a name or number unless the current token contains
	# punctuation from the list below, ++ or --
	# if (last ~ /[A-Za-z0-9_]$/ && $0 !~ /[\.:;()\[\],]|\+\+|--/)
	  if (last ~ /[A-Za-z0-9_]/ && $0 !~ /[\.;()\[\],]|\+\+|--/)
		return why("X3X")
	# blank before an opening parenthesis unless last ends in a name
	# character, an opening parenthesis, a newline, !, ~ or *
	if (last !~ /[A-Za-z0-9_(\n!\~\*]$/ && $0 == "(")
		return why("X4X")
	# blank after operator-like tokens unless the current token contains
	# a dot, semicolon, bracket or comma
	# if (last ~ /^[+-\/%^|&\]\)=:<>;]$|.=|<<|>>|&&|\|\|/ && $0 !~ /[\.:;\[\],]/)
	  if (last ~ /^[+-\/%^|&\]\)=:<>;]$|.=|<<|>>|&&|\|\|/ && $0 !~ /[\.;\[\],]/)
		return why("X5X")
	# doesnt catch anything on tac.c!!
	if (prev ~ /[A-Za-z0-9_]/ && !iskw(prev) && last ~ /\*/ && $0 !~ /[-+\.\*:;\)\(\[\],]/)
		return why("X6X")
	# blank after ++ or -- unless a name, number or closing punctuation follows
	if (last ~ /\+\+|--/ && $0 !~ /[A-Za-z0-9_;\],\)\*]/)
		return why("X7X")
	# default: no separator
	return ""
}

# Main rules.  They run in order for every input record; a rule that ends
# with next finishes the record, the others fall through to later rules.

# parenthesis depth, updated before any other rule looks at it
/^\(/   { ++paren }
/^\)/   { --paren }

# after typedef, the first record that is not a keyword is registered in
# both kw[] and typ[]
$0 == "typedef" { istype=1 }
istype && !iskw($0) { istype=0; ++kw[$0]; ++typ[$0]; }

# a keyword that is not a type name starts on a fresh line
# /^(if|while|for|return|switch|case|default|union|struct|typedef|#)/   { newline() }
iskw($0) && !istyp($0) { newline() }

# a type name marks the start of a declaration (cleared at the next semicolon)
istyp($0) { decl=1; }

# while and if: incomp stays set until paren is back at 0; that record is
# then shown and stmt is set to flag that a statement body follows
/^while$/	    { show($0); ++incomp; next }
/^if$/		    { show($0); ++incomp; next }
incomp && paren==0  { --incomp; stmt=1; show($0); next }

# for: infor is 1 after the keyword and grows with each semicolon (or a
# colon seen while it is still 1); once it exceeds 1 and paren is back at 0
# the header is finished and stmt is set
/^for$/		    { newline(); infor=1 }
infor && /^;/	    { ++infor; show($0 " "); next }
infor==1 && /^:/    { ++infor }	# C++ ?
# infor>1 && paren==0 { infor=0 }
infor>1 && paren==0 { infor=0; stmt=1; show($0); next }
# a body that opens with a brace is handled by the brace rule further down;
# any other body gets one extra indent level, counted in ti and undone at
# the next semicolon
stmt==1 && /^{/     { stmt=0 }
stmt==1             { stmt=0; ++ti; ++n; newline(); show("") }

# conditional operator: ? starts an indented line, the matching : and a
# terminating semicolon each start a new line and reduce the indent again
/^\?/		{ ++tern; ++n; newline(); show($0 " "); next }
tern && /^:/	{ newline(); show($0); --tern; --n; next }
tern && /^;/	{ newline(); show($0); tern = 0; --n; next }

# XXX need to track brace level apart from ti

# records starting with # get a line of their own; a semicolon ends the line
# and removes the ti temporary indents; braces adjust the nesting level n
/^#/	{ newline(); show($0); newline(); next }
/^;/	{ decl=0; show($0); while (ti--) --n; ti=0; newline(); next }
/^{/	{ show($0); ++n; newline(); next }
/^}/	{ --n; newline(); show($0); newline(); next }

# track distance between comma items, so that a,b,c is not broken, but a+=3,b-=2,c=a+b is
# inside a declaration a record containing a comma never causes a break;
# otherwise a comma at paren 0 breaks the line when more than 3 characters
# were printed since the position stored in comma
decl && paren==0 && /,/ { show($0); next }
paren==0 && /^,/ && len-comma > 3 { show($0); newline(); if (tern) for(i=0;i<tern;++i) show(" "); comma=len; next }
paren==0 && /^,/ { comma=len }

# everything else is simply shown
{ show($0) }
'

# Main pipeline:
#   pp      concatenates (and optionally preprocesses) the input files;
#   PROG    is run with -t and feeds its output to awk (falls back to tac when
#           PROG is empty, which matches the previously hard-coded command);
#   awk     runs the layout program held in $script;
#   sed     deletes every pair of double quotes that has only blanks between
#           them (presumably also tabs, depending on how sed reads \t) and
#           every "##".
# "$@" passes each file name through unchanged, even when it contains blanks.
# shellcheck disable=SC2086 # DEBUG holds "-v DEBUG=..." and must split into words
pp "$@" | "${PROG:-tac}" -t | awk $DEBUG "$script" | sed 's/"[ \t]*"//g;s/##//g'
