diff options
-rw-r--r-- | Sema/Sema.h | 8 | ||||
-rw-r--r-- | Sema/SemaChecking.cpp | 292 | ||||
-rw-r--r-- | Sema/SemaExpr.cpp | 2 | ||||
-rw-r--r-- | include/clang/Basic/DiagnosticKinds.def | 20 | ||||
-rw-r--r-- | test/Sema/format-strings.c | 41 |
5 files changed, 329 insertions, 34 deletions
diff --git a/Sema/Sema.h b/Sema/Sema.h index d373cc0400..22fffc7299 100644 --- a/Sema/Sema.h +++ b/Sema/Sema.h @@ -423,10 +423,14 @@ private: // Extra semantic analysis beyond the C type system private: - void CheckFunctionCall(Expr *Fn, FunctionDecl *FDecl, + void CheckFunctionCall(Expr *Fn, + SourceLocation LParenLoc, SourceLocation RParenLoc, + FunctionDecl *FDecl, Expr** Args, unsigned NumArgsInCall); - void CheckPrintfArguments(Expr *Fn, unsigned id_idx, FunctionDecl *FDecl, + void CheckPrintfArguments(Expr *Fn, + SourceLocation LParenLoc, SourceLocation RParenLoc, + bool HasVAListArg, FunctionDecl *FDecl, unsigned format_idx, Expr** Args, unsigned NumArgsInCall); }; diff --git a/Sema/SemaChecking.cpp b/Sema/SemaChecking.cpp index 8cc3c6c147..511f56f2a8 100644 --- a/Sema/SemaChecking.cpp +++ b/Sema/SemaChecking.cpp @@ -29,7 +29,9 @@ using namespace clang; /// CheckFunctionCall - Check a direct function call for various correctness /// and safety properties not strictly enforced by the C type system. void -Sema::CheckFunctionCall(Expr *Fn, FunctionDecl *FDecl, +Sema::CheckFunctionCall(Expr *Fn, + SourceLocation LParenLoc, SourceLocation RParenLoc, + FunctionDecl *FDecl, Expr** Args, unsigned NumArgsInCall) { // Get the IdentifierInfo* for the called function. @@ -37,55 +39,287 @@ Sema::CheckFunctionCall(Expr *Fn, FunctionDecl *FDecl, // Search the KnownFunctionIDs for the identifier. unsigned i = 0, e = id_num_known_functions; - for ( ; i != e; ++i) { if (KnownFunctionIDs[i] == FnInfo) break; } - if( i == e ) return; + for (; i != e; ++i) { if (KnownFunctionIDs[i] == FnInfo) break; } + if (i == e) return; // Printf checking. if (i <= id_vprintf) { - // Retrieve the index of the format string parameter. + // Retrieve the index of the format string parameter and determine + // if the function is passed a va_arg argument. unsigned format_idx = 0; + bool HasVAListArg = false; + switch (i) { default: assert(false && "No format string argument index."); case id_printf: format_idx = 0; break; case id_fprintf: format_idx = 1; break; case id_sprintf: format_idx = 1; break; case id_snprintf: format_idx = 2; break; - case id_vsnprintf: format_idx = 2; break; - case id_asprintf: format_idx = 1; break; - case id_vasprintf: format_idx = 1; break; - case id_vfprintf: format_idx = 1; break; - case id_vsprintf: format_idx = 1; break; - case id_vprintf: format_idx = 1; break; - } - CheckPrintfArguments(Fn, i, FDecl, format_idx, Args, NumArgsInCall); + case id_asprintf: format_idx = 1; HasVAListArg = true; break; + case id_vsnprintf: format_idx = 2; HasVAListArg = true; break; + case id_vasprintf: format_idx = 1; HasVAListArg = true; break; + case id_vfprintf: format_idx = 1; HasVAListArg = true; break; + case id_vsprintf: format_idx = 1; HasVAListArg = true; break; + case id_vprintf: format_idx = 0; HasVAListArg = true; break; + } + + CheckPrintfArguments(Fn, LParenLoc, RParenLoc, HasVAListArg, + FDecl, format_idx, Args, NumArgsInCall); } } /// CheckPrintfArguments - Check calls to printf (and similar functions) for -/// correct use of format strings. Improper format strings to functions in -/// the printf family can be the source of bizarre bugs and very serious -/// security holes. A good source of information is available in the following -/// paper (which includes additional references): +/// correct use of format strings. +/// +/// HasVAListArg - A predicate indicating whether the printf-like +/// function is passed an explicit va_arg argument (e.g., vprintf) +/// +/// format_idx - The index into Args for the format string. +/// +/// Improper format strings to functions in the printf family can be +/// the source of bizarre bugs and very serious security holes. A +/// good source of information is available in the following paper +/// (which includes additional references): /// /// FormatGuard: Automatic Protection From printf Format String /// Vulnerabilities, Proceedings of the 10th USENIX Security Symposium, 2001. +/// +/// Functionality implemented: +/// +/// We can statically check the following properties for string +/// literal format strings for non v.*printf functions (where the +/// arguments are passed directly): +// +/// (1) Are the number of format conversions equal to the number of +/// data arguments? +/// +/// (2) Does each format conversion correctly match the type of the +/// corresponding data argument? (TODO) +/// +/// Moreover, for all printf functions we can: +/// +/// (3) Check for a missing format string (when not caught by type checking). +/// +/// (4) Check for no-operation flags; e.g. using "#" with format +/// conversion 'c' (TODO) +/// +/// (5) Check the use of '%n', a major source of security holes. +/// +/// (6) Check for malformed format conversions that don't specify anything. +/// +/// (7) Check for empty format strings. e.g: printf(""); +/// +/// (8) Check that the format string is a wide literal. +/// +/// All of these checks can be done by parsing the format string. +/// +/// For now, we ONLY do (1), (3), (5), (6), (7), and (8). void -Sema::CheckPrintfArguments(Expr *Fn, unsigned id_idx, FunctionDecl *FDecl, +Sema::CheckPrintfArguments(Expr *Fn, + SourceLocation LParenLoc, SourceLocation RParenLoc, + bool HasVAListArg, FunctionDecl *FDecl, unsigned format_idx, Expr** Args, unsigned NumArgsInCall) { - - assert( format_idx < NumArgsInCall ); - + // CHECK: printf-like function is called with no format string. + if (format_idx >= NumArgsInCall) { + Diag(RParenLoc, diag::warn_printf_missing_format_string, + Fn->getSourceRange()); + return; + } + // CHECK: format string is not a string literal. // - // Dynamically generated format strings are difficult to automatically - // vet at compile time. Requiring that format strings are string literals - // (1) permits the checking of format strings by the compiler and thereby - // (2) can practically remove the source of many format string exploits. - + // Dynamically generated format strings are difficult to + // automatically vet at compile time. Requiring that format strings + // are string literals: (1) permits the checking of format strings by + // the compiler and thereby (2) can practically remove the source of + // many format string exploits. StringLiteral *FExpr = dyn_cast<StringLiteral>(Args[format_idx]); - if ( FExpr == NULL ) - Diag( Args[format_idx]->getLocStart(), - diag::warn_printf_not_string_constant, Fn->getSourceRange() ); -}
\ No newline at end of file + if (FExpr == NULL) { + Diag(Args[format_idx]->getLocStart(), + diag::warn_printf_not_string_constant, Fn->getSourceRange()); + return; + } + + // CHECK: is the format string a wide literal? + if (FExpr->isWide()) { + Diag(Args[format_idx]->getLocStart(), + diag::warn_printf_format_string_is_wide_literal, + Fn->getSourceRange()); + return; + } + + // Str - The format string. NOTE: this is NOT null-terminated! + const char * const Str = FExpr->getStrData(); + + // CHECK: empty format string? + const unsigned StrLen = FExpr->getByteLength(); + + if (StrLen == 0) { + Diag(Args[format_idx]->getLocStart(), + diag::warn_printf_empty_format_string, Fn->getSourceRange()); + return; + } + + // We process the format string using a binary state machine. The + // current state is stored in CurrentState. + enum { + state_OrdChr, + state_Conversion + } CurrentState = state_OrdChr; + + // numConversions - The number of conversions seen so far. This is + // incremented as we traverse the format string. + unsigned numConversions = 0; + + // numDataArgs - The number of data arguments after the format + // string. This can only be determined for non vprintf-like + // functions. For those functions, this value is 1 (the sole + // va_arg argument). + unsigned numDataArgs = NumArgsInCall-(format_idx+1); + + // Inspect the format string. + unsigned StrIdx = 0; + + // LastConversionIdx - Index within the format string where we last saw + // a '%' character that starts a new format conversion. + unsigned LastConversionIdx = 0; + + for ( ; StrIdx < StrLen ; ++StrIdx ) { + + // Is the number of detected conversion conversions greater than + // the number of matching data arguments? If so, stop. + if (!HasVAListArg && numConversions > numDataArgs) break; + + // Handle "\0" + if(Str[StrIdx] == '\0' ) { + // The string returned by getStrData() is not null-terminated, + // so the presence of a null character is likely an error. + + SourceLocation Loc = + PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),StrIdx+1); + + Diag(Loc, diag::warn_printf_format_string_contains_null_char, + Fn->getSourceRange()); + + return; + } + + // Ordinary characters (not processing a format conversion). + if (CurrentState == state_OrdChr) { + if (Str[StrIdx] == '%') { + CurrentState = state_Conversion; + LastConversionIdx = StrIdx; + } + continue; + } + + // Seen '%'. Now processing a format conversion. + switch (Str[StrIdx]) { + // Characters which can terminate a format conversion + // (e.g. "%d"). Characters that specify length modifiers or + // other flags are handled by the default case below. + // + // TODO: additional checks will go into the following cases. + case 'i': + case 'd': + case 'o': + case 'u': + case 'x': + case 'X': + case 'D': + case 'O': + case 'U': + case 'e': + case 'E': + case 'f': + case 'F': + case 'g': + case 'G': + case 'a': + case 'A': + case 'c': + case 'C': + case 'S': + case 's': + case 'P': + ++numConversions; + CurrentState = state_OrdChr; + break; + + // CHECK: Are we using "%n"? Issue a warning. + case 'n': { + ++numConversions; + CurrentState = state_OrdChr; + SourceLocation Loc = + PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(), + LastConversionIdx+1); + + Diag(Loc, diag::warn_printf_write_back, Fn->getSourceRange()); + break; + } + + // Handle "%%" + case '%': + // Sanity check: Was the first "%" character the previous one? + // If not, we will assume that we have a malformed format + // conversion, and that the current "%" character is the start + // of a new conversion. + if (StrIdx - LastConversionIdx == 1) + CurrentState = state_OrdChr; + else { + // Issue a warning: invalid format conversion. + SourceLocation Loc = + PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(), + LastConversionIdx+1); + + Diag(Loc, diag::warn_printf_invalid_conversion, + std::string(Str+LastConversionIdx, Str+StrIdx), + Fn->getSourceRange()); + + // This conversion is broken. Advance to the next format + // conversion. + LastConversionIdx = StrIdx; + ++numConversions; + } + + break; + + default: + // This case catches all other characters: flags, widths, etc. + // We should eventually process those as well. + break; + } + } + + if (CurrentState == state_Conversion) { + // Issue a warning: invalid format conversion. + SourceLocation Loc = + PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(), + LastConversionIdx+1); + + Diag(Loc, diag::warn_printf_invalid_conversion, + std::string(Str+LastConversionIdx, Str+StrIdx), + Fn->getSourceRange()); + return; + } + + if (!HasVAListArg) { + // CHECK: Does the number of format conversions exceed the number + // of data arguments? + if (numConversions > numDataArgs) { + SourceLocation Loc = + PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(), + LastConversionIdx); + + Diag(Loc, diag::warn_printf_insufficient_data_args, + Fn->getSourceRange()); + } + // CHECK: Does the number of data arguments exceed the number of + // format conversions in the format string? + else if (numConversions < numDataArgs) + Diag(Args[format_idx+numConversions+1]->getLocStart(), + diag::warn_printf_too_many_data_args, Fn->getSourceRange()); + } +} diff --git a/Sema/SemaExpr.cpp b/Sema/SemaExpr.cpp index 6b871b4812..f69867fdf1 100644 --- a/Sema/SemaExpr.cpp +++ b/Sema/SemaExpr.cpp @@ -561,7 +561,7 @@ ParseCallExpr(ExprTy *fn, SourceLocation LParenLoc, if (ImplicitCastExpr *IcExpr = dyn_cast<ImplicitCastExpr>(Fn)) if (DeclRefExpr *DRExpr = dyn_cast<DeclRefExpr>(IcExpr->getSubExpr())) if (FunctionDecl *FDecl = dyn_cast<FunctionDecl>(DRExpr->getDecl())) - CheckFunctionCall(Fn, FDecl, Args, NumArgsInCall); + CheckFunctionCall(Fn, LParenLoc, RParenLoc, FDecl, Args, NumArgsInCall); return new CallExpr(Fn, Args, NumArgsInCall, resultType, RParenLoc); } diff --git a/include/clang/Basic/DiagnosticKinds.def b/include/clang/Basic/DiagnosticKinds.def index 041f5537dd..9137a8419f 100644 --- a/include/clang/Basic/DiagnosticKinds.def +++ b/include/clang/Basic/DiagnosticKinds.def @@ -661,8 +661,24 @@ DIAG(warn_unused_expr, WARNING, // Printf checking DIAG(warn_printf_not_string_constant, WARNING, - "format string is not a string literal") - + "format string is not a string literal (potentially insecure)") +DIAG(warn_printf_write_back, WARNING, + "use of '%n' in format string discouraged (potentially insecure)") +DIAG(warn_printf_insufficient_data_args, WARNING, + "more '%' conversions than data arguments") +DIAG(warn_printf_too_many_data_args, WARNING, + "more data arguments than '%' conversions") +DIAG(warn_printf_invalid_conversion, WARNING, + "invalid conversion '%0'") +DIAG(warn_printf_missing_format_string, WARNING, + "format string missing") +DIAG(warn_printf_empty_format_string, WARNING, + "format string is empty") +DIAG(warn_printf_format_string_is_wide_literal, WARNING, + "format string should not be a wide string") +DIAG(warn_printf_format_string_contains_null_char, WARNING, + "format string contains '\\0' within the string body") + // Statements. DIAG(err_continue_not_in_loop, ERROR, "'continue' statement not in loop statement") diff --git a/test/Sema/format-strings.c b/test/Sema/format-strings.c index f71cd58645..403da07b0f 100644 --- a/test/Sema/format-strings.c +++ b/test/Sema/format-strings.c @@ -21,3 +21,44 @@ void check_string_literal( FILE* fp, const char* s, char *buf, ... ) { vsnprintf(buf,2,s,ap); // expected-warning {{mat string is not a string lit}} } +void check_writeback_specifier() +{ + int x; + char *b; + + printf("%n",&x); // expected-warning {{'%n' in format string discouraged}} + sprintf(b,"%d%%%n",1, &x); // expected-warning {{'%n' in format string dis}} +} + +void check_invalid_specifier(FILE* fp, char *buf) +{ + printf("%s%lb%d","unix",10,20); // expected-warning {{lid conversion '%lb'}} + fprintf(fp,"%%%l"); // expected-warning {{lid conversion '%l'}} + sprintf(buf,"%%%%%ld%d%d", 1, 2, 3); // no-warning + snprintf(buf, 2, "%%%%%ld%;%d", 1, 2, 3); // expected-warning {{sion '%;'}} +} + +void check_null_char_string(char* b) +{ + printf("\0this is bogus%d",1); // expected-warning {{string contains '\0'}} + snprintf(b,10,"%%%%%d\0%d",1,2); // expected-warning {{string contains '\0'}} + printf("%\0d",1); // expected-warning {{string contains '\0'}} +} + +void check_empty_format_string(char* buf) +{ + va_list ap; + va_start(ap,buf); + vprintf("",ap); // expected-warning {{format string is empty}} + sprintf(buf,""); // expected-warning {{format string is empty}} +} + +void check_wide_string() +{ + char *b; + va_list ap; + va_start(ap,b); + + printf(L"foo %d",2); // expected-warning {{should not be a wide string}} + vasprintf(&b,L"bar %d",2); // expected-warning {{should not be a wide string}} +} |