diff --git a/src/abstract-interpretation/absint-visitor.ts b/src/abstract-interpretation/absint-visitor.ts new file mode 100644 index 00000000000..158a49a7cec --- /dev/null +++ b/src/abstract-interpretation/absint-visitor.ts @@ -0,0 +1,337 @@ +import type { CfgSimpleVertex, ControlFlowInformation } from '../control-flow/control-flow-graph'; +import { CfgVertexType, getVertexRootId } from '../control-flow/control-flow-graph'; +import type { SemanticCfgGuidedVisitorConfiguration } from '../control-flow/semantic-cfg-guided-visitor'; +import { SemanticCfgGuidedVisitor } from '../control-flow/semantic-cfg-guided-visitor'; +import type { DataflowGraph } from '../dataflow/graph/graph'; +import { type DataflowGraphVertexFunctionCall, type DataflowGraphVertexVariableDefinition, VertexType } from '../dataflow/graph/vertex'; +import { getOriginInDfg, OriginType } from '../dataflow/origin/dfg-get-origin'; +import type { NoInfo, RNode } from '../r-bridge/lang-4.x/ast/model/model'; +import { EmptyArgument } from '../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import type { NormalizedAst, ParentInformation } from '../r-bridge/lang-4.x/ast/model/processing/decorate'; +import type { NodeId } from '../r-bridge/lang-4.x/ast/model/processing/node-id'; +import { RType } from '../r-bridge/lang-4.x/ast/model/type'; +import { guard, isNotUndefined } from '../util/assert'; +import { AbstractDomain, type AnyAbstractDomain } from './domains/abstract-domain'; +import type { StateAbstractDomain } from './domains/state-abstract-domain'; + +export interface AbsintVisitorConfiguration +extends Omit>, 'defaultVisitingOrder' | 'defaultVisitingType'> { + readonly domain: StateAbstractDomain; +} + +/** + * A control flow graph visitor to perform abstract interpretation. 
+ */ +export abstract class AbstractInterpretationVisitor = AbsintVisitorConfiguration> + extends SemanticCfgGuidedVisitor, DataflowGraph, Config & { defaultVisitingOrder: 'forward', defaultVisitingType: 'exit' }> { + /** + * The state of the abstract interpretation visitor mapping node IDs to the abstract state at the respective node. + */ + protected readonly state: Map> = new Map(); + + /** + * A set of nodes representing variable definitions that have already been visited but whose assignment has not yet been processed. + */ + private readonly unassigned: Set = new Set(); + + /** + * The current state domain at the currently processed AST node. + */ + private currentDomain: StateAbstractDomain; + + constructor(config: Config) { + super({ ...config, defaultVisitingOrder: 'forward', defaultVisitingType: 'exit' }); + + this.currentDomain = config.domain.bottom(); + } + + /** + * Resolves the inferred abstract value of an AST node. + * This requires that the abstract interpretation visitor has been completed, or at least started. + * @param id - The ID of the node to get the inferred value for + * @param domain - An optional state abstract domain used to resolve the inferred abstract value (defaults to the state at the requested node) + * @returns The inferred abstract value of the node, or `undefined` if no value was inferred for the node + */ + public getValue(id: RNode | NodeId | undefined, domain?: StateAbstractDomain): Domain | undefined { + const node = (id === undefined || typeof id === 'object') ? id : this.getNormalizedAst(id); + domain ??= node !== undefined ? this.getState(node.info.id) : undefined; + + if(node === undefined) { + return; + } else if(domain?.has(node.info.id)) { + return domain.get(node.info.id); + } + const vertex = this.getDataflowGraph(node.info.id); + const call = vertex?.tag === VertexType.FunctionCall ? vertex : undefined; + const origins = Array.isArray(call?.origin) ? 
call.origin : []; + + if(node.type === RType.Symbol) { + const values = this.getVariableOrigins(node.info.id).map(origin => domain?.get(origin)); + + if(values.length > 0 && values.every(isNotUndefined)) { + return AbstractDomain.joinAll(values); + } + } else if(node.type === RType.Argument && node.value !== undefined) { + return this.getValue(node.value, domain); + } else if(node.type === RType.ExpressionList && node.children.length > 0) { + return this.getValue(node.children[node.children.length - 1], domain); + } else if(node.type === RType.Pipe) { + return this.getValue(node.rhs, domain); + } else if(origins.includes('builtin:pipe')) { + if(node.type === RType.BinaryOp) { + return this.getValue(node.rhs, domain); + } else if(call?.args.length === 2 && call?.args[1] !== EmptyArgument) { + return this.getValue(call.args[1].nodeId, domain); + } + } else if(node.type === RType.IfThenElse) { + if(node.otherwise !== undefined) { + const values = [node.then, node.otherwise].map(entry => this.getValue(entry, domain)); + + if(values.length > 0 && values.every(isNotUndefined)) { + return AbstractDomain.joinAll(values); + } + } + } else if(origins.includes('builtin:if-then-else') && call?.args.every(arg => arg !== EmptyArgument)) { + if(call.args.length === 3) { + const values = call.args.slice(1, 3).map(entry => this.getValue(entry.nodeId, domain)); + + if(values.length > 0 && values.every(isNotUndefined)) { + return AbstractDomain.joinAll(values); + } + } + } + } + + /** + * Gets the inferred abstract state at the location of a specific AST node. + * This requires that the abstract interpretation visitor has been completed, or at least started. + * @param id - The ID of the node to get the abstract state at + * @returns The abstract state at the node, or `undefined` if the node has no abstract state (i.e. the node has not been visited or is unreachable). + */ + public getState(id: NodeId | undefined): StateAbstractDomain | undefined { + return id !== undefined ? 
this.state.get(id) : undefined; + } + + /** + * Gets the inferred abstract state at the end of the program (exit nodes of the control flow graph). + * This requires that the abstract interpretation visitor has been completed, or at least started. + * @returns The inferred abstract state at the end of the program + */ + public getResult(): StateAbstractDomain { + const exitPoints = this.config.controlFlow.exitPoints.map(id => this.getCfgVertex(id)).filter(isNotUndefined); + const exitNodes = exitPoints.map(vertex => getVertexRootId(vertex)).filter(isNotUndefined); + const domains = exitNodes.map(node => this.getState(node)).filter(isNotUndefined); + + return this.config.domain.bottom().joinAll(domains); + } + + public override start(): void { + guard(this.state.size === 0, 'Abstract interpretation visitor has already been started'); + super.start(); + this.unassigned.clear(); + } + + protected override visitNode(vertexId: NodeId): boolean { + const vertex = this.getCfgVertex(vertexId); + + if(vertex === undefined) { + return true; + } + const nodeId = getVertexRootId(vertex); + + if(this.isWideningPoint(nodeId)) { + // only check widening points at the entry vertex + if(vertex.type === CfgVertexType.EndMarker) { + return true; + } + const oldDomain = this.getState(nodeId) ?? this.config.domain.bottom(); + const predecessorDomains = this.getPredecessorNodes(vertex.id).map(pred => this.getState(pred)).filter(isNotUndefined); + this.currentDomain = this.config.domain.bottom().joinAll(predecessorDomains); + + if(this.shouldWiden(vertex)) { + this.currentDomain = oldDomain.widen(this.currentDomain); + } + this.state.set(nodeId, this.currentDomain); + + const visitedCount = this.visited.get(vertex.id) ?? 
0; + this.visited.set(vertex.id, visitedCount + 1); + + // only continue visiting if the widening point is visited for the first time or the abstract state at the widening point changed + return visitedCount === 0 || !oldDomain.equals(this.currentDomain); + } else if(this.shouldSkipVertex(vertex)) { + return true; + } + const predecessorDomains = this.getPredecessorNodes(vertex.id).map(pred => this.getState(pred)).filter(isNotUndefined); + this.currentDomain = this.config.domain.bottom().joinAll(predecessorDomains); + + this.onVisitNode(vertexId); + + // discard the inferred abstract state when encountering functions with unknown side effects (e.g. `eval`) + if(this.config.dfg.unknownSideEffects.has(nodeId)) { + this.currentDomain = this.currentDomain.bottom(); + } + this.state.set(nodeId, this.currentDomain); + + return true; + } + + protected override onVariableDefinition({ vertex }: { vertex: DataflowGraphVertexVariableDefinition; }): void { + if(this.getState(vertex.id) === undefined) { + this.unassigned.add(vertex.id); + } + } + + protected override onAssignmentCall({ target, source }: { call: DataflowGraphVertexFunctionCall, target?: NodeId, source?: NodeId }): void { + if(target === undefined || source === undefined) { + return; + } + const value = this.getValue(source); + this.unassigned.delete(target); + + if(value !== undefined) { + this.currentDomain.set(target, value); + this.state.set(target, this.currentDomain.create(this.currentDomain.value)); + } + } + + protected override onReplacementCall({ call, target, source }: { call: DataflowGraphVertexFunctionCall, target?: NodeId, source?: NodeId }): void { + if(source === undefined || target === undefined) { + return; + } + this.currentDomain = this.evalReplacementCall(call, target, source, this.currentDomain); + this.unassigned.delete(target); + } + + protected override onAccessCall({ call }: { call: DataflowGraphVertexFunctionCall }): void { + this.currentDomain = this.evalAccessCall(call, 
this.currentDomain); + } + + protected override onUnnamedCall({ call }: { call: DataflowGraphVertexFunctionCall }): void { + this.currentDomain = this.evalFunctionCall(call, this.currentDomain); + } + + protected override onEvalFunctionCall({ call }: { call: DataflowGraphVertexFunctionCall }): void { + this.currentDomain = this.evalFunctionCall(call, this.currentDomain); + } + + protected override onApplyFunctionCall({ call }: { call: DataflowGraphVertexFunctionCall }): void { + this.currentDomain = this.evalFunctionCall(call, this.currentDomain); + } + + protected override onSourceCall({ call }: { call: DataflowGraphVertexFunctionCall }): void { + this.currentDomain = this.evalFunctionCall(call, this.currentDomain); + } + + protected override onGetCall({ call }: { call: DataflowGraphVertexFunctionCall }): void { + this.currentDomain = this.evalFunctionCall(call, this.currentDomain); + } + + protected override onRmCall({ call }: { call: DataflowGraphVertexFunctionCall }): void { + this.currentDomain = this.evalFunctionCall(call, this.currentDomain); + } + + protected override onListCall({ call }: { call: DataflowGraphVertexFunctionCall }): void { + this.currentDomain = this.evalFunctionCall(call, this.currentDomain); + } + + protected override onVectorCall({ call }: { call: DataflowGraphVertexFunctionCall }): void { + this.currentDomain = this.evalFunctionCall(call, this.currentDomain); + } + + protected override onSpecialBinaryOpCall({ call }: { call: DataflowGraphVertexFunctionCall }): void { + this.currentDomain = this.evalFunctionCall(call, this.currentDomain); + } + + protected override onQuoteCall({ call }: { call: DataflowGraphVertexFunctionCall }): void { + this.currentDomain = this.evalFunctionCall(call, this.currentDomain); + } + + protected override onLibraryCall({ call }: { call: DataflowGraphVertexFunctionCall }): void { + this.currentDomain = this.evalFunctionCall(call, this.currentDomain); + } + + protected override onDefaultFunctionCall({ call }: { 
call: DataflowGraphVertexFunctionCall }): void { + this.currentDomain = this.evalFunctionCall(call, this.currentDomain); + } + + /** + * Evaluates any function call visited by the abstract interpretation visitor by applying the abstract semantics of the function call to the current abstract state. + * @param call - The data flow vertex of the function call to evaluate + * @param domain - The current abstract state before the evaluation of the function call + * @returns The abstract state after applying the abstract semantics of the function call + */ + protected abstract evalFunctionCall(call: DataflowGraphVertexFunctionCall, domain: StateAbstractDomain): StateAbstractDomain; + + /** + * Evaluates any replacement function call visited by the abstract interpretation visitor by applying the abstract semantics of the replacement call to the current abstract state (e.g. for `$<-`, `[<-`, `names<-`, ...). + * @param call - The data flow vertex of the replacement call to evaluate + * @param target - The node ID of the assignment target of the replacement call + * @param source - The node ID of the assigned expression of the replacement call + * @param domain - The current abstract state before the evaluation of the replacement call + * @returns The abstract state after applying the abstract semantics of the replacement call + */ + protected abstract evalReplacementCall(call: DataflowGraphVertexFunctionCall, target: NodeId, source: NodeId, domain: StateAbstractDomain): StateAbstractDomain; + + /** + * Evaluates any access operation call visited by the abstract interpretation visitor by applying the abstract semantics of the access operation to the current abstract state (e.g. for `$`, `[`, `[[`, ...). 
+ * @param call - The data flow vertex of the access operation to evaluate + * @param domain - The current abstract state before the evaluation of the access operation + * @returns The abstract state after applying the abstract semantics of the access operation + */ + protected abstract evalAccessCall(call: DataflowGraphVertexFunctionCall, domain: StateAbstractDomain): StateAbstractDomain; + + /** Gets the root AST node IDs of all predecessor vertices that are leaf nodes or exit vertices (skipped entry vertices are replaced by their own predecessors, recursively) */ + protected getPredecessorNodes(vertexId: NodeId): NodeId[] { + return this.config.controlFlow.graph.outgoingEdges(vertexId)?.keys() // outgoing dependency edges are incoming CFG edges + .map(id => this.getCfgVertex(id)) + .flatMap(vertex => { + if(vertex === undefined) { + return []; + } else if(this.shouldSkipVertex(vertex)) { + return this.getPredecessorNodes(vertex.id); + } else { + return [getVertexRootId(vertex)]; + } + }) + .toArray() ?? []; + } + + /** Gets each variable origin that has already been visited and whose assignment has already been processed */ + protected getVariableOrigins(nodeId: NodeId): NodeId[] { + return getOriginInDfg(this.config.dfg, nodeId) + ?.filter(origin => origin.type === OriginType.ReadVariableOrigin) + .map(origin => origin.id) + .filter(origin => this.state.has(origin) && !this.unassigned.has(origin)) ?? 
[]; + } + + /** We only perform widening at `for`, `while`, or `repeat` loops with more than one incoming CFG edge */ + protected isWideningPoint(nodeId: NodeId): boolean { + const incomingEdges = this.config.controlFlow.graph.outgoingEdges(nodeId)?.size; // outgoing dependency edges are incoming CFG edges + + if(incomingEdges === undefined || incomingEdges <= 1) { + return false; + } + const node = this.getNormalizedAst(nodeId); + + if(node?.type === RType.ForLoop || node?.type === RType.WhileLoop || node?.type === RType.RepeatLoop) { + return true; + } + const dataflowVertex = this.getDataflowGraph(nodeId); + + if(dataflowVertex?.tag !== VertexType.FunctionCall || !Array.isArray(dataflowVertex.origin)) { + return false; + } + const origin = dataflowVertex.origin; + + return origin.includes('builtin:for-loop') || origin.includes('builtin:while-loop') || origin.includes('builtin:repeat-loop'); + } + + /** We only process vertices of leaf nodes and exit vertices (no entry nodes of complex nodes) */ + protected shouldSkipVertex(vertex: CfgSimpleVertex): boolean { + return vertex.type !== CfgVertexType.EndMarker && vertex.end !== undefined; + } + + protected shouldWiden(vertex: CfgSimpleVertex): boolean { + return (this.visited.get(vertex.id) ?? 
0) >= this.config.ctx.config.abstractInterpretation.wideningThreshold; + } +} diff --git a/src/abstract-interpretation/data-frame/absint-info.ts b/src/abstract-interpretation/data-frame/absint-info.ts deleted file mode 100644 index d5ae788799b..00000000000 --- a/src/abstract-interpretation/data-frame/absint-info.ts +++ /dev/null @@ -1,129 +0,0 @@ -import type { RNode } from '../../r-bridge/lang-4.x/ast/model/model'; -import type { ParentInformation } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; -import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; -import type { DataFrameStateDomain } from './dataframe-domain'; -import type { ConstraintType, DataFrameOperationArgs, DataFrameOperationName, DataFrameOperationOptions } from './semantics'; - -/** - * An abstract data frame operation without additional options. - * - `operation` contains the type of the abstract operation (see {@link DataFrameOperationName}) - * - `operand` contains the ID of the data frame operand of the operation (may be `undefined`) - * - `...args` contains the arguments of the abstract operation (see {@link DataFrameOperationArgs}) - */ -export type DataFrameOperationType = { - [Name in OperationName]: { - operation: Name, - operand: NodeId | undefined - } & DataFrameOperationArgs -}[OperationName]; - -/** - * An abstract data frame operation. 
- * - `operation` contains the type of the abstract operation (see {@link DataFrameOperationName}) - * - `operand` contains the ID of the data frame operand of the operation (may be `undefined`) - * - `type` optionally contains the constraint type to overwrite the default type of the operation (see {@link ConstraintType}) - * - `options` optionally contains additional options for the abstract operation (see {@link DataFrameOperationOptions}) - * - `...args` contains the arguments of the abstract operation (see {@link DataFrameOperationArgs}) - */ -export type DataFrameOperation = { - [Name in OperationName]: { - operation: Name, - operand: NodeId | undefined, - type?: ConstraintType, - options?: DataFrameOperationOptions - } & DataFrameOperationArgs; -}[OperationName]; - -/** - * Represents the base data frame information stored in the abstract interpretation info of an AST node. - * - `type` optionally defines the type of the extra information stored in the data frame info - * - `domain` contains the abstract data frame shape state of the node - * This may not be present if the data frame shape inference has not been executed yet or the program contains no data frames - */ -interface DataFrameInfoBase { - type?: string, - domain?: DataFrameStateDomain -} - -/** Enum to mark nodes during the data frame shape inference */ -export enum DataFrameInfoMarker { - /** Marks the target symbol of assignments as "unassigned" until the assigned expression is evaluated */ - Unassigned = 'unassigned' -} - -/** - * Represents the data frame information for a node without extra data frame information, - * i.e. for all nodes that do not represent a data frame assignment or data frame operation (this is the default). - * - * The `marker` can be used to mark nodes during the data frame shape inference. 
- */ -interface DataFrameEmptyInfo extends DataFrameInfoBase { - type?: never, - marker?: DataFrameInfoMarker -} - -/** - * Represents the data frame information for a data frame assignment with a target identifier (symbol/string) and an assigned expression. - * This is used during data frame shape inference to mark assignments of data frame expressions to an identifier. - * - * Use {@link hasDataFrameAssignmentInfo} to check whether an AST node has attached data frame assignment information. - */ -export interface DataFrameAssignmentInfo extends DataFrameInfoBase { - type: 'assignment', - identifier: NodeId, - expression: NodeId -} - -/** - * Represents the data frame information for a data frame function/operation with mapped abstract operations. - * This is used during data frame shape inference to store the abstract operations a data frame function/operation is mapped to. - * - * The order of the abstract operations is the order in which their semantics are applied (for example, access operations are typically before other operations in the list). - * Moreover, abstract operations that take the result of previous abstract operation as data frame operand must have the `operand` set to `undefined`. - * - * Use {@link hasDataFrameExpressionInfo} to check whether an AST node has attached data frame expression information. - */ -export interface DataFrameExpressionInfo extends DataFrameInfoBase { - type: 'expression', - operations: DataFrameOperation[] -} - -/** - * Represents the data frame shape inference information stored in the abstract interpretation info of AST nodes. - */ -export type DataFrameInfo = DataFrameEmptyInfo | DataFrameAssignmentInfo | DataFrameExpressionInfo; - -/** - * Represents the abstract interpretation information attached to AST nodes. - */ -export interface AbstractInterpretationInfo { - dataFrame?: DataFrameInfo -} - -/** - * Checks whether an AST node has attached data frame assignment information. 
- */ -export function hasDataFrameAssignmentInfo( - node: RNode -): node is RNode { - return node.info.dataFrame?.type === 'assignment'; -} - -/** - * Checks whether an AST node has attached data frame expression information. - */ -export function hasDataFrameExpressionInfo( - node: RNode -): node is RNode { - return node.info.dataFrame?.type === 'expression'; -} - -/** - * Checks whether an AST node has an attached data frame info marker. - */ -export function hasDataFrameInfoMarker( - node: RNode, - marker: DataFrameInfoMarker -): boolean { - return node.info.dataFrame?.type === undefined && node.info.dataFrame?.marker === marker; -} diff --git a/src/abstract-interpretation/data-frame/absint-visitor.ts b/src/abstract-interpretation/data-frame/absint-visitor.ts deleted file mode 100644 index 3463c43acd9..00000000000 --- a/src/abstract-interpretation/data-frame/absint-visitor.ts +++ /dev/null @@ -1,231 +0,0 @@ -import { - type CfgBasicBlockVertex, - type CfgSimpleVertex, - CfgVertexType, - type ControlFlowInformation, - getVertexRootId, - isMarkerVertex -} from '../../control-flow/control-flow-graph'; -import { - SemanticCfgGuidedVisitor, - type SemanticCfgGuidedVisitorConfiguration -} from '../../control-flow/semantic-cfg-guided-visitor'; -import type { DataflowGraph } from '../../dataflow/graph/graph'; -import type { DataflowGraphVertexFunctionCall, DataflowGraphVertexVariableDefinition } from '../../dataflow/graph/vertex'; -import type { NoInfo, RNode } from '../../r-bridge/lang-4.x/ast/model/model'; -import type { NormalizedAst, ParentInformation } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; -import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; -import { isNotUndefined } from '../../util/assert'; -import { - type AbstractInterpretationInfo, - DataFrameInfoMarker, - hasDataFrameAssignmentInfo, - hasDataFrameExpressionInfo, - hasDataFrameInfoMarker -} from './absint-info'; -import { DataFrameDomain, 
DataFrameStateDomain } from './dataframe-domain'; -import { mapDataFrameAccess } from './mappers/access-mapper'; -import { isAssignmentTarget, mapDataFrameVariableAssignment } from './mappers/assignment-mapper'; -import { mapDataFrameFunctionCall } from './mappers/function-mapper'; -import { mapDataFrameReplacementFunction } from './mappers/replacement-mapper'; -import { applyDataFrameSemantics, ConstraintType, getConstraintType } from './semantics'; -import { getVariableOrigins, resolveIdToDataFrameShape } from './shape-inference'; - -export type DataFrameShapeInferenceVisitorConfiguration< - OtherInfo = NoInfo, - ControlFlow extends ControlFlowInformation = ControlFlowInformation, - Ast extends NormalizedAst = NormalizedAst, - Dfg extends DataflowGraph = DataflowGraph -> = Omit, 'defaultVisitingOrder' | 'defaultVisitingType'>; - -/** - * The control flow graph visitor to infer the shape of data frames using abstract interpretation - */ -export class DataFrameShapeInferenceVisitor< - OtherInfo = NoInfo, - ControlFlow extends ControlFlowInformation = ControlFlowInformation, - Ast extends NormalizedAst = NormalizedAst, - Dfg extends DataflowGraph = DataflowGraph, - Config extends DataFrameShapeInferenceVisitorConfiguration = DataFrameShapeInferenceVisitorConfiguration -> extends SemanticCfgGuidedVisitor { - /** - * The old domain of an AST node before processing the node retrieved from the attached {@link AbstractInterpretationInfo}. - * This is used to check whether the state has changed and successors should be visited again, and is also required for widening. - */ - private oldDomain = DataFrameStateDomain.bottom(); - /** - * The new domain of an AST node during and after processing the node. - * This information is stored in the {@link AbstractInterpretationInfo} afterward. 
- */ - private newDomain = DataFrameStateDomain.bottom(); - - constructor(config: Config) { - super({ ...config, defaultVisitingOrder: 'forward', defaultVisitingType: 'exit' }); - } - - protected override visitNode(nodeId: NodeId): boolean { - const vertex = this.getCfgVertex(nodeId); - - // skip vertices representing entries of complex nodes - if(vertex === undefined || this.shouldSkipVertex(vertex)) { - return true; - } - const predecessors = this.getPredecessorNodes(vertex.id); - const predecessorDomains = predecessors.map(node => node.info.dataFrame?.domain).filter(isNotUndefined); - this.newDomain = DataFrameStateDomain.bottom().joinAll(predecessorDomains); - this.onVisitNode(nodeId); - - const visitedCount = this.visited.get(vertex.id) ?? 0; - this.visited.set(vertex.id, visitedCount + 1); - - // only continue visiting if the node has not been visited before or the data frame value of the node changed - return visitedCount === 0 || !this.oldDomain.equals(this.newDomain); - } - - protected override visitDataflowNode(vertex: Exclude): void { - const node = this.getNormalizedAst(getVertexRootId(vertex)); - - if(node === undefined) { - return; - } - this.oldDomain = node.info.dataFrame?.domain ?? 
DataFrameStateDomain.bottom(); - super.visitDataflowNode(vertex); - - if(this.config.dfg.unknownSideEffects.has(getVertexRootId(vertex))) { - this.newDomain = this.newDomain.bottom(); - } - if(this.shouldWiden(vertex)) { - this.newDomain = this.oldDomain.widen(this.newDomain); - } - node.info.dataFrame ??= {}; - node.info.dataFrame.domain = this.newDomain; - } - - protected onVariableDefinition({ vertex }: { vertex: DataflowGraphVertexVariableDefinition; }): void { - const node = this.getNormalizedAst(vertex.id); - - if(node !== undefined) { - // mark variable definitions as "unassigned", as the evaluation of the assigned expression is delayed until processing the assignment - node.info.dataFrame ??= { marker: DataFrameInfoMarker.Unassigned }; - } - } - - protected override onAssignmentCall({ call, target, source }: { call: DataflowGraphVertexFunctionCall, target?: NodeId, source?: NodeId }): void { - const node = this.getNormalizedAst(call.id); - const targetNode = this.getNormalizedAst(target); - const sourceNode = this.getNormalizedAst(source); - - if(node !== undefined && isAssignmentTarget(targetNode) && sourceNode !== undefined) { - node.info.dataFrame = mapDataFrameVariableAssignment(targetNode, sourceNode, this.config.dfg, this.config.ctx); - this.applyDataFrameAssignment(node); - this.clearUnassignedInfo(targetNode); - } - } - - protected override onAccessCall({ call }: { call: DataflowGraphVertexFunctionCall }): void { - const node = this.getNormalizedAst(call.id); - - if(node !== undefined) { - node.info.dataFrame = mapDataFrameAccess(node, this.config.dfg, this.config.ctx); - this.applyDataFrameExpression(node); - } - } - - protected override onDefaultFunctionCall({ call }: { call: DataflowGraphVertexFunctionCall }): void { - const node = this.getNormalizedAst(call.id); - - if(node !== undefined) { - node.info.dataFrame = mapDataFrameFunctionCall(node, this.config.dfg, this.config.ctx); - this.applyDataFrameExpression(node); - } - } - - protected 
override onReplacementCall({ call, source, target }: { call: DataflowGraphVertexFunctionCall, source: NodeId | undefined, target: NodeId | undefined }): void { - const node = this.getNormalizedAst(call.id); - const targetNode = this.getNormalizedAst(target); - const sourceNode = this.getNormalizedAst(source); - - if(node !== undefined && targetNode !== undefined && sourceNode !== undefined) { - node.info.dataFrame = mapDataFrameReplacementFunction(node, sourceNode, this.config.dfg, this.config.ctx); - this.applyDataFrameExpression(node); - this.clearUnassignedInfo(targetNode); - } - } - - private applyDataFrameAssignment(node: RNode) { - if(!hasDataFrameAssignmentInfo(node)) { - return; - } - const value = resolveIdToDataFrameShape(node.info.dataFrame.expression, this.config.dfg, this.newDomain); - - if(value !== undefined) { - this.newDomain.set(node.info.dataFrame.identifier, value); - const identifier = this.getNormalizedAst(node.info.dataFrame.identifier); - - if(identifier !== undefined) { - identifier.info.dataFrame ??= {}; - identifier.info.dataFrame.domain = this.newDomain.create(this.newDomain.value); - } - } - } - - private applyDataFrameExpression(node: RNode) { - if(!hasDataFrameExpressionInfo(node)) { - return; - } - const maxColNames = this.config.ctx.config.abstractInterpretation.dataFrame.maxColNames; - let value = DataFrameDomain.top(maxColNames); - - for(const { operation, operand, type, options, ...args } of node.info.dataFrame.operations) { - const operandValue = operand !== undefined ? resolveIdToDataFrameShape(operand, this.config.dfg, this.newDomain) : value; - value = applyDataFrameSemantics(operation, operandValue ?? DataFrameDomain.top(maxColNames), args, options); - const constraintType = type ?? 
getConstraintType(operation); - - if(operand !== undefined && constraintType === ConstraintType.OperandModification) { - this.newDomain.set(operand, value); - - for(const origin of getVariableOrigins(operand, this.config.dfg)) { - this.newDomain.set(origin.info.id, value); - } - } else if(constraintType === ConstraintType.ResultPostcondition) { - this.newDomain.set(node.info.id, value); - } - } - } - - /** We only process vertices of leaf nodes and exit vertices (no entry nodes of complex nodes) */ - private shouldSkipVertex(vertex: CfgSimpleVertex) { - return isMarkerVertex(vertex) ? vertex.type !== CfgVertexType.EndMarker : vertex.end !== undefined; - } - - /** Get all AST nodes for the predecessor vertices that are leaf nodes and exit vertices */ - private getPredecessorNodes(vertexId: NodeId): RNode[] { - return this.config.controlFlow.graph.outgoingEdges(vertexId)?.keys() // outgoing dependency edges are incoming CFG edges - .map(id => this.getCfgVertex(id)) - .flatMap(vertex => { - if(vertex === undefined) { - return []; - } else if(this.shouldSkipVertex(vertex)) { - return this.getPredecessorNodes(vertex.id); - } else { - return [this.getNormalizedAst(getVertexRootId(vertex))]; - } - }) - .filter(isNotUndefined) - .toArray() ?? []; - } - - private shouldWiden(vertex: Exclude): boolean { - return (this.visited.get(vertex.id) ?? 
0) >= this.config.ctx.config.abstractInterpretation.dataFrame.wideningThreshold; - } - - private clearUnassignedInfo(node: RNode) { - if(hasDataFrameInfoMarker(node, DataFrameInfoMarker.Unassigned)) { - if(node.info.dataFrame?.domain !== undefined) { - node.info.dataFrame = { domain: node.info.dataFrame.domain }; - } else { - delete node.info.dataFrame; - } - } - } -} diff --git a/src/abstract-interpretation/data-frame/mappers/access-mapper.ts b/src/abstract-interpretation/data-frame/mappers/access-mapper.ts index 9d2d83dca0e..bde179d4532 100644 --- a/src/abstract-interpretation/data-frame/mappers/access-mapper.ts +++ b/src/abstract-interpretation/data-frame/mappers/access-mapper.ts @@ -1,15 +1,15 @@ import { VariableResolve } from '../../../config'; import type { ResolveInfo } from '../../../dataflow/eval/resolve/alias-tracking'; import type { DataflowGraph } from '../../../dataflow/graph/graph'; +import type { ReadOnlyFlowrAnalyzerContext } from '../../../project/context/flowr-analyzer-context'; import type { RNode } from '../../../r-bridge/lang-4.x/ast/model/model'; import type { RAccess, RIndexAccess, RNamedAccess } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-access'; import { EmptyArgument, type RFunctionArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; -import type { DataFrameExpressionInfo, DataFrameOperation } from '../absint-info'; import { resolveIdToArgValue, resolveIdToArgValueSymbolName, unquoteArgument } from '../resolve-args'; +import type { DataFrameOperation, DataFrameShapeInferenceVisitor } from '../shape-inference'; import { getArgumentValue, isDataFrameArgument } from './arguments'; -import type { ReadOnlyFlowrAnalyzerContext } from '../../../project/context/flowr-analyzer-context'; /** * Special named arguments of index-based access operators @@ 
-20,40 +20,38 @@ const SpecialAccessArgumentsMapper: Record = }; /** - * Maps a concrete data frame access to abstract data frame operations. + * Maps a concrete data frame access operation to abstract data frame operations. * @param node - The R node of the access * @param dfg - The data flow graph for resolving the arguments - * @param ctx - The read-only Flowr analyzer context - * @returns Data frame expression info containing the mapped abstract data frame operations, or `undefined` if the node does not represent a data frame access + * @param ctx - The current flowR analyzer context + * @returns The mapped abstract data frame operations for the access operation, or `undefined` if the node does not represent a data frame access operation */ export function mapDataFrameAccess( node: RNode, + inference: DataFrameShapeInferenceVisitor, dfg: DataflowGraph, ctx: ReadOnlyFlowrAnalyzerContext -): DataFrameExpressionInfo | undefined { +): DataFrameOperation[] | undefined { if(node.type !== RType.Access) { return; } const resolveInfo = { graph: dfg, idMap: dfg.idMap, full: true, resolve: VariableResolve.Alias, ctx }; - let operations: DataFrameOperation[] | undefined; if(isStringBasedAccess(node)) { - operations = mapDataFrameNamedColumnAccess(node, resolveInfo); + return mapDataFrameNamedColumnAccess(node, inference, resolveInfo); } else { - operations = mapDataFrameIndexColRowAccess(node, resolveInfo); - } - if(operations !== undefined) { - return { type: 'expression', operations: operations }; + return mapDataFrameIndexColRowAccess(node, inference, resolveInfo); } } function mapDataFrameNamedColumnAccess( access: RNamedAccess, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] | undefined { const dataFrame = access.accessed; - if(!isDataFrameArgument(dataFrame, info)) { + if(!isDataFrameArgument(dataFrame, inference)) { return; } const colname = resolveIdToArgValueSymbolName(access.access[0], info); @@ -67,6 +65,7 @@ function 
mapDataFrameNamedColumnAccess( function mapDataFrameIndexColRowAccess( access: RIndexAccess, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] | undefined { const dataFrame = access.accessed; @@ -74,7 +73,7 @@ function mapDataFrameIndexColRowAccess( const exact = getArgumentValue(access.access, 'exact', info); const args = getAccessArgs(access.operator, access.access); - if(!isDataFrameArgument(dataFrame, info)) { + if(!isDataFrameArgument(dataFrame, inference)) { return; } else if(args.every(arg => arg === EmptyArgument)) { return [{ operation: 'identity', operand: dataFrame.info.id }]; diff --git a/src/abstract-interpretation/data-frame/mappers/arguments.ts b/src/abstract-interpretation/data-frame/mappers/arguments.ts index 925c3624e9c..a263f034cd0 100644 --- a/src/abstract-interpretation/data-frame/mappers/arguments.ts +++ b/src/abstract-interpretation/data-frame/mappers/arguments.ts @@ -10,9 +10,8 @@ import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/pro import { visitAst } from '../../../r-bridge/lang-4.x/ast/model/processing/visitor'; import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; import { RNull } from '../../../r-bridge/lang-4.x/convert-values'; -import type { AbstractInterpretationInfo } from '../absint-info'; import { resolveIdToArgName, resolveIdToArgValue, unquoteArgument } from '../resolve-args'; -import { resolveIdToDataFrameShape } from '../shape-inference'; +import type { DataFrameShapeInferenceVisitor } from '../shape-inference'; /** Regular expression representing valid columns names, e.g. for `data.frame` */ const ColNamesRegex = /^[A-Za-z.][A-Za-z0-9_.]*$/; @@ -206,17 +205,17 @@ export function hasCriticalArgument( } /** - * Checks if a given argument has any data frame shape information and therefore may represent a data frame. 
- * @param arg - The argument to check - * @param info - Argument resolve information - * @returns Whether the argument has any data frame shape information and may represent a data frame + * Checks if a given argument has inferred data frame shape information and therefore represents a data frame + * @param arg - The argument to check + * @param inference - The data frame shape inference visitor to use + * @returns Whether the argument has inferred data frame shape information and represents a data frame */ -export function isDataFrameArgument(arg: RNode | undefined, info: ResolveInfo): - arg is RNode>; -export function isDataFrameArgument(arg: RFunctionArgument | undefined, info: ResolveInfo): - arg is RArgument> & { value: RNode> }; -export function isDataFrameArgument(arg: RNode | RFunctionArgument | undefined, info: ResolveInfo): boolean { - return arg !== EmptyArgument && resolveIdToDataFrameShape(arg, info.graph) !== undefined; +export function isDataFrameArgument(arg: RNode | undefined, inference: DataFrameShapeInferenceVisitor): + arg is RNode; +export function isDataFrameArgument(arg: RFunctionArgument | undefined, inference: DataFrameShapeInferenceVisitor): + arg is RArgument & { value: RNode }; +export function isDataFrameArgument(arg: RNode | RFunctionArgument | undefined, inference: DataFrameShapeInferenceVisitor): boolean { + return arg !== EmptyArgument && inference.getValue(arg) !== undefined; } /** diff --git a/src/abstract-interpretation/data-frame/mappers/assignment-mapper.ts b/src/abstract-interpretation/data-frame/mappers/assignment-mapper.ts deleted file mode 100644 index 184c8f88334..00000000000 --- a/src/abstract-interpretation/data-frame/mappers/assignment-mapper.ts +++ /dev/null @@ -1,44 +0,0 @@ -import { VariableResolve } from '../../../config'; -import type { DataflowGraph } from '../../../dataflow/graph/graph'; -import type { RNode } from '../../../r-bridge/lang-4.x/ast/model/model'; -import type { RString } from 
'../../../r-bridge/lang-4.x/ast/model/nodes/r-string'; -import type { RSymbol } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-symbol'; -import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; -import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; -import type { DataFrameAssignmentInfo } from '../absint-info'; -import { isDataFrameArgument } from './arguments'; -import type { ReadOnlyFlowrAnalyzerContext } from '../../../project/context/flowr-analyzer-context'; - -/** - * Maps a concrete data frame assignment to data frame assignment info containing the ids of the identifier and assigned expression. - * We currently do not support function assignments dealing with data frames. - * @param identifier - The R node of the variable identifier - * @param expression - The R node of the assigned expression - * @param dfg - The data flow graph for resolving the arguments - * @param ctx - The analysis context - * @returns Data frame assignment info containing the IDs of the identifier and expression, or `undefined` if the node does not represent a data frame assignment - */ -export function mapDataFrameVariableAssignment( - identifier: RSymbol | RString, - expression: RNode, - dfg: DataflowGraph, - ctx: ReadOnlyFlowrAnalyzerContext -): DataFrameAssignmentInfo | undefined { - const resolveInfo = { graph: dfg, idMap: dfg.idMap, full: true, resolve: VariableResolve.Alias, ctx }; - - if(!isDataFrameArgument(expression, resolveInfo)) { - return; - } - return { - type: 'assignment', - identifier: identifier.info.id, - expression: expression.info.id - }; -} - -/** - * Checks whether a R node represents an assignment target, i.e. is a `RSymbol` or `RString`. 
- */ -export function isAssignmentTarget(node: RNode | undefined): node is RSymbol | RString { - return node?.type === RType.Symbol || node?.type === RType.String; -} diff --git a/src/abstract-interpretation/data-frame/mappers/function-mapper.ts b/src/abstract-interpretation/data-frame/mappers/function-mapper.ts index fdf632aaa6e..9b5c12a5c3c 100644 --- a/src/abstract-interpretation/data-frame/mappers/function-mapper.ts +++ b/src/abstract-interpretation/data-frame/mappers/function-mapper.ts @@ -11,31 +11,11 @@ import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; import { requestFromInput, type RParseRequest } from '../../../r-bridge/retriever'; import { assertUnreachable, isNotUndefined, isUndefined } from '../../../util/assert'; import { readLineByLineSync } from '../../../util/files'; -import type { DataFrameExpressionInfo, DataFrameOperation } from '../absint-info'; import { DataFrameDomain } from '../dataframe-domain'; -import { - resolveIdToArgName, - resolveIdToArgValue, - resolveIdToArgValueSymbolName, - resolveIdToArgVectorLength, - unescapeSpecialChars -} from '../resolve-args'; +import { resolveIdToArgName, resolveIdToArgValue, resolveIdToArgValueSymbolName, resolveIdToArgVectorLength, unescapeSpecialChars } from '../resolve-args'; import type { ConstraintType } from '../semantics'; -import { resolveIdToDataFrameShape } from '../shape-inference'; -import { - escapeRegExp, - filterValidNames, - type FunctionParameterLocation, - getArgumentValue, - getEffectiveArgs, - getFunctionArgument, - getFunctionArguments, - getUnresolvedSymbolsInExpression, - hasCriticalArgument, - isDataFrameArgument, - isNamedArgument, - isRNull -} from './arguments'; +import type { DataFrameOperation, DataFrameShapeInferenceVisitor } from '../shape-inference'; +import { escapeRegExp, filterValidNames, getArgumentValue, getEffectiveArgs, getFunctionArgument, getFunctionArguments, getUnresolvedSymbolsInExpression, hasCriticalArgument, isDataFrameArgument, 
isNamedArgument, isRNull, type FunctionParameterLocation } from './arguments'; /** * Represents the different types of data frames in R @@ -588,6 +568,7 @@ type OtherDataFrameFunctionMapping = OtherDataFrameEntryPoint | OtherDataFrameTr type DataFrameFunctionMapping = ( args: readonly RFunctionArgument[], params: Params, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo, ctx: ReadOnlyFlowrAnalyzerContext ) => DataFrameOperation[] | undefined; @@ -611,18 +592,18 @@ type DataFrameFunctionParamsMapping = { * @param node - The R node of the function call * @param dfg - The data flow graph for resolving the arguments * @param ctx - The current flowR analyzer context - * @returns Data frame expression info containing the mapped abstract data frame operations, or `undefined` if the node does not represent a data frame function call + * @returns The mapped abstract data frame operations for the function call, or `undefined` if the node does not represent a data frame function call */ export function mapDataFrameFunctionCall( node: RNode, + inference: DataFrameShapeInferenceVisitor, dfg: DataflowGraph, ctx: ReadOnlyFlowrAnalyzerContext -): DataFrameExpressionInfo | undefined { +): DataFrameOperation[] | undefined { if(node.type !== RType.FunctionCall || !node.named) { return; } const resolveInfo = { graph: dfg, idMap: dfg.idMap, full: true, resolve: VariableResolve.Alias, ctx }; - let operations: DataFrameOperation[] | undefined; if(isDataFrameFunction(node.functionName.content)) { const functionName = node.functionName.content as Name; @@ -631,9 +612,9 @@ export function mapDataFrameFunctionCall( const args = getFunctionArguments(node, dfg); if(hasCriticalArgument(args, params.critical, resolveInfo)) { - operations = [{ operation: 'unknown', operand: undefined }]; + return [{ operation: 'unknown', operand: undefined }]; } else { - operations = mapper(args, params, resolveInfo, ctx); + return mapper(args, params, inference, resolveInfo, ctx); } } else { const 
mapping = getOtherDataFrameFunction(node.functionName.content); @@ -641,17 +622,14 @@ export function mapDataFrameFunctionCall( if(mapping === undefined) { return; } else if(mapping.type === 'entry_point') { - operations = [{ operation: 'unknown', operand: undefined }]; + return [{ operation: 'unknown', operand: undefined }]; } else if(mapping.type === 'transformation' || mapping.type === 'modification') { const args = getFunctionArguments(node, dfg); - operations = mapDataFrameUnknown(args, mapping, resolveInfo); + return mapDataFrameUnknown(args, mapping, inference, resolveInfo); } else { assertUnreachable(mapping); } } - if(operations !== undefined) { - return { type: 'expression', operations }; - } } function isDataFrameFunction(functionName: string): functionName is DataFrameFunction { @@ -670,6 +648,7 @@ function mapDataFrameCreate( noDupNames: FunctionParameterLocation, special: string[] }, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] { const checkNames = getArgumentValue(args, params.checkNames, info); @@ -701,6 +680,7 @@ function mapDataFrameCreate( function mapDataFrameConvert( args: readonly RFunctionArgument[], params: { dataFrame: FunctionParameterLocation }, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] | undefined { const dataFrame = getFunctionArgument(args, params.dataFrame, info); @@ -728,6 +708,7 @@ function mapDataFrameRead( noDupNames: FunctionParameterLocation, noEmptyNames?: boolean }, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo, ctx: ReadOnlyFlowrAnalyzerContext ): DataFrameOperation[] { @@ -793,10 +774,11 @@ function mapDataFrameRead( function mapDataFrameColBind( args: readonly RFunctionArgument[], params: { special: string[] }, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] | undefined { args = getEffectiveArgs(args, params.special); - const dataFrame = args.find(arg => isDataFrameArgument(arg, 
info)); + const dataFrame = args.find(arg => isDataFrameArgument(arg, inference)); if(dataFrame === undefined) { return; @@ -809,7 +791,7 @@ function mapDataFrameColBind( for(const arg of args) { if(arg !== dataFrame && arg !== EmptyArgument) { - const otherDataFrame = resolveIdToDataFrameShape(arg.value, info.graph); + const otherDataFrame = inference.getValue(arg.value); if(otherDataFrame !== undefined) { result.push({ @@ -840,10 +822,11 @@ function mapDataFrameColBind( function mapDataFrameRowBind( args: readonly RFunctionArgument[], params: { special: string[] }, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] | undefined { args = getEffectiveArgs(args, params.special); - const dataFrame = args.find(arg => isDataFrameArgument(arg, info)); + const dataFrame = args.find(arg => isDataFrameArgument(arg, inference)); if(dataFrame === undefined) { return; @@ -856,7 +839,7 @@ function mapDataFrameRowBind( for(const arg of args) { if(arg !== dataFrame && arg !== EmptyArgument) { - const otherDataFrame = resolveIdToDataFrameShape(arg.value, info.graph); + const otherDataFrame = inference.getValue(arg.value); if(otherDataFrame !== undefined) { result.push({ @@ -886,11 +869,12 @@ function mapDataFrameRowBind( function mapDataFrameHeadTail( args: readonly RFunctionArgument[], params: { dataFrame: FunctionParameterLocation, amount: FunctionParameterLocation }, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] | undefined { const dataFrame = getFunctionArgument(args, params.dataFrame, info); - if(!isDataFrameArgument(dataFrame, info)) { + if(!isDataFrameArgument(dataFrame, inference)) { return; } const result: DataFrameOperation[] = []; @@ -928,11 +912,12 @@ function mapDataFrameSubset( select: FunctionParameterLocation, drop: FunctionParameterLocation }, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] | undefined { const dataFrame = getFunctionArgument(args, 
params.dataFrame, info); - if(!isDataFrameArgument(dataFrame, info)) { + if(!isDataFrameArgument(dataFrame, inference)) { return; } else if(args.length === 1) { return [{ operation: 'identity', operand: dataFrame.value.info.id }]; @@ -1002,12 +987,13 @@ function mapDataFrameSubset( function mapDataFrameFilter( args: readonly RFunctionArgument[], params: { dataFrame: FunctionParameterLocation, special: string[] }, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] | undefined { args = getEffectiveArgs(args, params.special); const dataFrame = getFunctionArgument(args, params.dataFrame, info); - if(!isDataFrameArgument(dataFrame, info)) { + if(!isDataFrameArgument(dataFrame, inference)) { return; } else if(args.length === 1) { return [{ operation: 'identity', operand: dataFrame.value.info.id }]; @@ -1039,12 +1025,13 @@ function mapDataFrameFilter( function mapDataFrameSelect( args: readonly RFunctionArgument[], params: { dataFrame: FunctionParameterLocation, special: string[] }, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] | undefined { args = getEffectiveArgs(args, params.special); const dataFrame = getFunctionArgument(args, params.dataFrame, info); - if(!isDataFrameArgument(dataFrame, info)) { + if(!isDataFrameArgument(dataFrame, inference)) { return; } const result: DataFrameOperation[] = []; @@ -1108,12 +1095,13 @@ function mapDataFrameMutate( checkNames?: boolean, noDupNames?: boolean }, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] | undefined { args = getEffectiveArgs(args, params.special); const dataFrame = getFunctionArgument(args, params.dataFrame, info); - if(!isDataFrameArgument(dataFrame, info)) { + if(!isDataFrameArgument(dataFrame, inference)) { return; } else if(args.length === 1) { return [{ operation: 'identity', operand: dataFrame.value.info.id }]; @@ -1173,12 +1161,13 @@ function mapDataFrameGroupBy( by: FunctionParameterLocation, 
special: string[] }, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] | undefined { args = getEffectiveArgs(args, params.special); const dataFrame = getFunctionArgument(args, params.dataFrame, info); - if(!isDataFrameArgument(dataFrame, info)) { + if(!isDataFrameArgument(dataFrame, inference)) { return; } else if(args.length === 1) { return [{ operation: 'identity', operand: dataFrame.value.info.id }]; @@ -1211,12 +1200,13 @@ function mapDataFrameGroupBy( function mapDataFrameSummarize( args: readonly RFunctionArgument[], params: { dataFrame: FunctionParameterLocation, special: string[] }, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] | undefined { args = getEffectiveArgs(args, params.special); const dataFrame = getFunctionArgument(args, params.dataFrame, info); - if(!isDataFrameArgument(dataFrame, info)) { + if(!isDataFrameArgument(dataFrame, inference)) { return; } const result: DataFrameOperation[] = []; @@ -1255,6 +1245,7 @@ function mapDataFrameJoin( joinLeft: FunctionParameterLocation, joinRight: FunctionParameterLocation }, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo, ctx: ReadOnlyFlowrAnalyzerContext ): DataFrameOperation[] | undefined { @@ -1263,7 +1254,7 @@ function mapDataFrameJoin( const joinLeft = getArgumentValue(args, params.joinLeft, info); const joinRight = getArgumentValue(args, params.joinRight, info); - if(!isDataFrameArgument(dataFrame, info)) { + if(!isDataFrameArgument(dataFrame, inference)) { return; } else if(args.length === 1) { return [{ operation: 'identity', operand: dataFrame.value.info.id }]; @@ -1275,7 +1266,7 @@ function mapDataFrameJoin( const otherArg = getFunctionArgument(args, params.otherDataFrame, info); const byArg = getFunctionArgument(args, params.by, info); - const otherDataFrame = resolveIdToDataFrameShape(otherArg, info.graph) ?? 
DataFrameDomain.top(ctx.config.abstractInterpretation.dataFrame.maxColNames); + const otherDataFrame = inference.getValue(otherArg) ?? DataFrameDomain.top(ctx.config.abstractInterpretation.dataFrame.maxColNames); let byCols: (string | number | undefined)[] | undefined; const joinType = getJoinType(joinAll, joinLeft, joinRight); @@ -1322,12 +1313,13 @@ function mapDataFrameIdentity( special: string[], disallowNamedArgs?: boolean }, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] | undefined { args = getEffectiveArgs(args, params.special); const dataFrame = getFunctionArgument(args, params.dataFrame, info); - if(!isDataFrameArgument(dataFrame, info)) { + if(!isDataFrameArgument(dataFrame, inference)) { return; } else if(params.disallowNamedArgs && args.some(isNamedArgument)) { return [{ operation: 'unknown', operand: dataFrame.value.info.id }]; @@ -1344,6 +1336,7 @@ function mapDataFrameUnknown( dataFrame?: FunctionParameterLocation, constraintType?: Exclude }, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] | undefined { let dataFrame; @@ -1351,10 +1344,10 @@ function mapDataFrameUnknown( if(params.dataFrame !== undefined) { dataFrame = getFunctionArgument(args, params.dataFrame, info); } else { - dataFrame = args.find(arg => isDataFrameArgument(arg ,info)); + dataFrame = args.find(arg => isDataFrameArgument(arg, inference)); } - if(!isDataFrameArgument(dataFrame, info)) { + if(!isDataFrameArgument(dataFrame, inference)) { return; } return [{ diff --git a/src/abstract-interpretation/data-frame/mappers/replacement-mapper.ts b/src/abstract-interpretation/data-frame/mappers/replacement-mapper.ts index b600bf2f766..a0406032b6a 100644 --- a/src/abstract-interpretation/data-frame/mappers/replacement-mapper.ts +++ b/src/abstract-interpretation/data-frame/mappers/replacement-mapper.ts @@ -4,6 +4,7 @@ import type { ResolveInfo } from '../../../dataflow/eval/resolve/alias-tracking' import type { 
DataflowGraph } from '../../../dataflow/graph/graph'; import { isFunctionCallVertex } from '../../../dataflow/graph/vertex'; import { toUnnamedArgument } from '../../../dataflow/internal/process/functions/call/argument/make-argument'; +import type { ReadOnlyFlowrAnalyzerContext } from '../../../project/context/flowr-analyzer-context'; import type { RNode } from '../../../r-bridge/lang-4.x/ast/model/model'; import type { RAccess, RIndexAccess, RNamedAccess } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-access'; import type { RArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-argument'; @@ -11,12 +12,11 @@ import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-func import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; import type { NodeId } from '../../../r-bridge/lang-4.x/ast/model/processing/node-id'; import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; -import type { DataFrameExpressionInfo, DataFrameOperation } from '../absint-info'; import { resolveIdToArgStringVector, resolveIdToArgValue, resolveIdToArgValueSymbolName } from '../resolve-args'; import { ConstraintType } from '../semantics'; +import type { DataFrameOperation, DataFrameShapeInferenceVisitor } from '../shape-inference'; import { isStringBasedAccess } from './access-mapper'; import { isDataFrameArgument, isRNull } from './arguments'; -import type { ReadOnlyFlowrAnalyzerContext } from '../../../project/context/flowr-analyzer-context'; /** Mapper for mapping the supported data frame replacement functions to mapper functions */ const DataFrameReplacementFunctionMapper = { @@ -36,6 +36,7 @@ const DataFrameReplacementFunctionMapper = { type DataFrameReplacementFunctionMapping = ( operand: RArgument, expression: RNode, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo, parent?: RNode ) => DataFrameOperation[] | undefined; @@ -44,43 +45,40 @@ type DataFrameReplacementFunctionMapping = ( type 
DataFrameReplacementFunction = keyof typeof DataFrameReplacementFunctionMapper; /** - * Maps a concrete data frame replacement function to abstract data frame operations. - * @param node - The R node of the replacement function + * Maps a concrete data frame replacement function call to abstract data frame operations. + * @param node - The R node of the replacement function call * @param dfg - The data flow graph for resolving the arguments - * @param ctx - The read-only Flowr analysis context - * @returns Data frame expression info containing the mapped abstract data frame operations, or `undefined` if the node does not represent a data frame replacement function + * @param ctx - The current flowR analysis context + * @returns The mapped abstract data frame operations for the replacement function call, or `undefined` if the node does not represent a data frame replacement function call */ export function mapDataFrameReplacementFunction( node: RNode, expression: RNode, + inference: DataFrameShapeInferenceVisitor, dfg: DataflowGraph, ctx: ReadOnlyFlowrAnalyzerContext -): DataFrameExpressionInfo | undefined { +): DataFrameOperation[] | undefined { const parent = hasParentReplacement(node, dfg) ? 
dfg.idMap?.get(node.info.parent) : undefined; const resolveInfo = { graph: dfg, idMap: dfg.idMap, full: true, resolve: VariableResolve.Alias, ctx }; - let operations: DataFrameOperation[] | undefined; if(node.type === RType.Access) { if(node.access.every(arg => arg === EmptyArgument)) { - operations = mapDataFrameContentAssignment(node, expression, resolveInfo); + return mapDataFrameContentAssignment(node, expression, inference); } else if(isStringBasedAccess(node)) { - operations = mapDataFrameNamedColumnAssignment(node, expression, resolveInfo); + return mapDataFrameNamedColumnAssignment(node, expression, inference, resolveInfo); } else { - operations = mapDataFrameIndexColRowAssignment(node, expression, resolveInfo); + return mapDataFrameIndexColRowAssignment(node, expression, inference, resolveInfo); } } else if(node.type === RType.FunctionCall && node.named && node.arguments.length === 1 && node.arguments[0] !== EmptyArgument) { if(isDataFrameReplacement(node.functionName.content)) { const functionName = node.functionName.content; - const functionMapping = DataFrameReplacementFunctionMapper[functionName]; + const mapper = DataFrameReplacementFunctionMapper[functionName]; - operations = functionMapping(node.arguments[0], expression, resolveInfo, parent); + return mapper(node.arguments[0], expression, inference, resolveInfo, parent); } else { - operations = mapDataFrameUnknownAssignment(node.arguments[0], expression, resolveInfo); + return mapDataFrameUnknownAssignment(node.arguments[0], expression, inference); } } - if(operations !== undefined) { - return { type: 'expression', operations: operations }; - } } function isDataFrameReplacement(functionName: string): functionName is DataFrameReplacementFunction { @@ -97,11 +95,11 @@ function hasParentReplacement(node: RNode, dfg: DataflowGraph function mapDataFrameContentAssignment( access: RAccess, expression: RNode, - info: ResolveInfo + inference: DataFrameShapeInferenceVisitor ): DataFrameOperation[] | undefined 
{ const dataFrame = access.accessed; - if(!isDataFrameArgument(dataFrame, info)) { + if(!isDataFrameArgument(dataFrame, inference)) { return; } if(isRNull(expression)) { @@ -123,11 +121,12 @@ function mapDataFrameContentAssignment( function mapDataFrameNamedColumnAssignment( access: RNamedAccess, expression: RNode, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] | undefined { const dataFrame = access.accessed; - if(!isDataFrameArgument(dataFrame, info)) { + if(!isDataFrameArgument(dataFrame, inference)) { return; } const colname = resolveIdToArgValueSymbolName(access.access[0], info); @@ -152,12 +151,13 @@ function mapDataFrameNamedColumnAssignment( function mapDataFrameIndexColRowAssignment( access: RIndexAccess, expression: RNode, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo ): DataFrameOperation[] | undefined { const dataFrame = access.accessed; const args = access.access; - if(!isDataFrameArgument(dataFrame, info) || args.every(arg => arg === EmptyArgument)) { + if(!isDataFrameArgument(dataFrame, inference) || args.every(arg => arg === EmptyArgument)) { return; } const result: DataFrameOperation[] = []; @@ -212,10 +212,11 @@ function mapDataFrameIndexColRowAssignment( function mapDataFrameColNamesAssignment( operand: RArgument, expression: RNode, + inference: DataFrameShapeInferenceVisitor, info: ResolveInfo, parent?: RNode ): DataFrameOperation[] | undefined { - if(!isDataFrameArgument(operand, info)) { + if(!isDataFrameArgument(operand, inference)) { return; } const argument = info.idMap !== undefined ? 
toUnnamedArgument(expression, info.idMap) : EmptyArgument; @@ -232,9 +233,9 @@ function mapDataFrameColNamesAssignment( function mapDataFrameRowNamesAssignment( operand: RArgument, expression: RNode, - info: ResolveInfo, + inference: DataFrameShapeInferenceVisitor ): DataFrameOperation[] | undefined { - if(!isDataFrameArgument(operand, info)) { + if(!isDataFrameArgument(operand, inference)) { return; } return [{ @@ -247,9 +248,9 @@ function mapDataFrameRowNamesAssignment( function mapDataFrameDimNamesAssignment( operand: RArgument, expression: RNode, - info: ResolveInfo + inference: DataFrameShapeInferenceVisitor ): DataFrameOperation[] | undefined { - if(!isDataFrameArgument(operand, info)) { + if(!isDataFrameArgument(operand, inference)) { return; } return [{ @@ -262,9 +263,9 @@ function mapDataFrameDimNamesAssignment( function mapDataFrameUnknownAssignment( operand: RArgument, expression: RNode, - info: ResolveInfo + inference: DataFrameShapeInferenceVisitor ): DataFrameOperation[] | undefined { - if(!isDataFrameArgument(operand, info)) { + if(!isDataFrameArgument(operand, inference)) { return; } return [{ diff --git a/src/abstract-interpretation/data-frame/shape-inference.ts b/src/abstract-interpretation/data-frame/shape-inference.ts index 184f7410437..d91b31f1a88 100644 --- a/src/abstract-interpretation/data-frame/shape-inference.ts +++ b/src/abstract-interpretation/data-frame/shape-inference.ts @@ -1,118 +1,138 @@ -import { type ControlFlowInformation, getVertexRootId } from '../../control-flow/control-flow-graph'; -import type { DataflowGraph } from '../../dataflow/graph/graph'; -import { VertexType } from '../../dataflow/graph/vertex'; -import { getOriginInDfg, OriginType } from '../../dataflow/origin/dfg-get-origin'; -import type { ReadOnlyFlowrAnalyzerContext } from '../../project/context/flowr-analyzer-context'; -import type { RNode } from '../../r-bridge/lang-4.x/ast/model/model'; -import { EmptyArgument } from 
'../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; -import type { NormalizedAst, ParentInformation } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; +import type { DataflowGraphVertexFunctionCall } from '../../dataflow/graph/vertex'; +import type { NoInfo, RNode } from '../../r-bridge/lang-4.x/ast/model/model'; +import type { ParentInformation } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; -import { RType } from '../../r-bridge/lang-4.x/ast/model/type'; -import { isNotUndefined } from '../../util/assert'; -import { AbstractDomain } from '../domains/abstract-domain'; -import { type AbstractInterpretationInfo, DataFrameInfoMarker, hasDataFrameInfoMarker } from './absint-info'; -import { DataFrameShapeInferenceVisitor } from './absint-visitor'; -import { type DataFrameDomain, DataFrameStateDomain } from './dataframe-domain'; +import { AbstractInterpretationVisitor, type AbsintVisitorConfiguration } from '../absint-visitor'; +import { DataFrameDomain, DataFrameStateDomain } from './dataframe-domain'; +import { mapDataFrameAccess } from './mappers/access-mapper'; +import { mapDataFrameFunctionCall } from './mappers/function-mapper'; +import { mapDataFrameReplacementFunction } from './mappers/replacement-mapper'; +import { applyDataFrameSemantics, ConstraintType, getConstraintType, type DataFrameOperationArgs, type DataFrameOperationName, type DataFrameOperationOptions } from './semantics'; /** - * Infers the shape of data frames by performing abstract interpretation using the control flow graph of a program. - * This directly attaches the inferred data frames shapes to the AST (see {@link AbstractInterpretationInfo}). 
- * @param cfinfo - The control flow information containing the control flow graph - * @param dfg - The data flow graph to resolve variable origins and function arguments - * @param ast - The abstract syntax tree to resolve node IDs to AST nodes - * @param ctx - The current flowr analyzer context - * @returns The abstract data frame state at the exit node of the control flow graph (see {@link DataFrameStateDomain}). - * The abstract data frame states for all other nodes are attached to the AST. + * An abstract data frame operation. + * - `operation` contains the type of the abstract operation (see {@link DataFrameOperationName}) + * - `operand` contains the ID of the data frame operand of the operation (may be `undefined`) + * - `type` optionally contains the constraint type to overwrite the default type of the operation (see {@link ConstraintType}) + * - `options` optionally contains additional options for the abstract operation (see {@link DataFrameOperationOptions}) + * - `...args` contains the arguments of the abstract operation (see {@link DataFrameOperationArgs}) */ -export function inferDataFrameShapes( - cfinfo: ControlFlowInformation, - dfg: DataflowGraph, - ast: NormalizedAst, - ctx: ReadOnlyFlowrAnalyzerContext -): DataFrameStateDomain { - const visitor = new DataFrameShapeInferenceVisitor({ controlFlow: cfinfo, dfg: dfg, normalizedAst: ast, ctx }); - visitor.start(); - const exitPoints = cfinfo.exitPoints.map(id => cfinfo.graph.getVertex(id)).filter(isNotUndefined); - const exitNodes = exitPoints.map(vertex => ast.idMap.get(getVertexRootId(vertex))).filter(isNotUndefined); - const domains = exitNodes.map(node => node.info.dataFrame?.domain).filter(isNotUndefined); - - return DataFrameStateDomain.bottom().joinAll(domains); +export type DataFrameOperation = { + [Name in OperationName]: { + operation: Name; + operand: NodeId | undefined; + type?: ConstraintType; + options?: DataFrameOperationOptions; + } & DataFrameOperationArgs; +}[OperationName]; + +/** 
+ * An abstract data frame operation without additional options. + * - `operation` contains the type of the abstract operation (see {@link DataFrameOperationName}) + * - `operand` contains the ID of the data frame operand of the operation (may be `undefined`) + * - `...args` contains the arguments of the abstract operation (see {@link DataFrameOperationArgs}) + */ +export type DataFrameOperationType = { + [Name in OperationName]: { + operation: Name; + operand: NodeId | undefined; + } & DataFrameOperationArgs; +}[OperationName]; + +interface DataFrameShapeInferenceConfiguration extends Omit, 'domain'> { + readonly trackOperations?: boolean; } /** - * Resolves the abstract data frame shape of a node in the AST. - * This requires that the data frame shape inference has been executed before using {@link inferDataFrameShapes}. - * @param id - The node or node ID to get the data frame shape for - * @param dfg - The data flow graph used to resolve the data frame shape - * @param domain - An optional abstract data frame state domain used to resolve the data frame shape (defaults to the state at the requested node) - * @returns The abstract data frame shape of the node, or `undefined` if no data frame shape was inferred for the node + * The control flow graph visitor to infer the shape of data frames using abstract interpretation */ -export function resolveIdToDataFrameShape( - id: RNode | NodeId | undefined, - dfg: DataflowGraph | undefined, - domain?: DataFrameStateDomain -): DataFrameDomain | undefined { - const node: RNode | undefined = id === undefined || typeof id === 'object' ? 
id : dfg?.idMap?.get(id); - domain ??= node?.info.dataFrame?.domain; - - if(dfg === undefined || node === undefined || domain === undefined) { - return; - } else if(domain.has(node.info.id)) { - return domain.get(node.info.id); +export class DataFrameShapeInferenceVisitor extends AbstractInterpretationVisitor { + /** + * The abstract data frame operations the function call nodes are mapped to. + */ + private readonly operations?: Map; + + constructor({ trackOperations = true, ...config }: DataFrameShapeInferenceConfiguration) { + super({ ...config, domain: DataFrameStateDomain.bottom() }); + + if(trackOperations) { + this.operations = new Map(); + } } - const vertex = dfg.getVertex(node.info.id); - const call = vertex?.tag === VertexType.FunctionCall ? vertex : undefined; - const origins = Array.isArray(call?.origin) ? call.origin : []; - if(node.type === RType.Symbol) { - const values = getVariableOrigins(node.info.id, dfg).map(origin => domain.get(origin.info.id)); + public getGlobalState(): ReadonlyMap { + return this.state; + } - if(values.length > 0 && values.every(isNotUndefined)) { - return AbstractDomain.joinAll(values); + /** + * Gets the mapped abstract data frame operations for an AST node. + * This requires that the abstract interpretation visitor has been completed, or at least started.. + * @param id - The ID of the node to get the mapped abstract operations for + * @returns The mapped abstract data frame operations for the node, or `undefined` if no abstract operation was mapped for the node or storing mapped abstract operations is disabled via the visitor config. + */ + public getOperations(id: NodeId | undefined): readonly DataFrameOperation[] | undefined { + return id !== undefined ? 
this.operations?.get(id) : undefined; + } + + protected override evalFunctionCall(call: DataflowGraphVertexFunctionCall, domain: DataFrameStateDomain): DataFrameStateDomain { + const node = this.getNormalizedAst(call.id); + + if(node === undefined) { + return domain; } - } else if(node.type === RType.Argument && node.value !== undefined) { - return resolveIdToDataFrameShape(node.value, dfg, domain); - } else if(node.type === RType.ExpressionList && node.children.length > 0) { - return resolveIdToDataFrameShape(node.children[node.children.length - 1], dfg, domain); - } else if(node.type === RType.Pipe) { - return resolveIdToDataFrameShape(node.rhs, dfg, domain); - } else if(origins.includes('builtin:pipe')) { - if(node.type === RType.BinaryOp) { - return resolveIdToDataFrameShape(node.rhs, dfg, domain); - } else if(call?.args.length === 2 && call?.args[1] !== EmptyArgument) { - return resolveIdToDataFrameShape(call.args[1].nodeId, dfg, domain); + const operations = mapDataFrameFunctionCall(node, this, this.config.dfg, this.config.ctx); + + return this.applyDataFrameExpression(node, operations, domain); + } + + protected override evalReplacementCall(call: DataflowGraphVertexFunctionCall, target: NodeId, source: NodeId, domain: DataFrameStateDomain): DataFrameStateDomain { + const node = this.getNormalizedAst(call.id); + const targetNode = this.getNormalizedAst(target); + const sourceNode = this.getNormalizedAst(source); + + if(node === undefined || targetNode === undefined || sourceNode === undefined) { + return domain; } - } else if(node.type === RType.IfThenElse) { - if(node.otherwise !== undefined) { - const values = [node.then, node.otherwise].map(entry => resolveIdToDataFrameShape(entry, dfg, domain)); + const operations = mapDataFrameReplacementFunction(node, sourceNode, this, this.config.dfg, this.config.ctx); - if(values.length > 0 && values.every(isNotUndefined)) { - return AbstractDomain.joinAll(values); - } + return this.applyDataFrameExpression(node, 
operations, domain); + } + + protected override evalAccessCall(call: DataflowGraphVertexFunctionCall, domain: DataFrameStateDomain): DataFrameStateDomain { + const node = this.getNormalizedAst(call.id); + + if(node === undefined) { + return domain; + } + const operations = mapDataFrameAccess(node, this, this.config.dfg, this.config.ctx); + + return this.applyDataFrameExpression(node, operations, domain); + } + + private applyDataFrameExpression(node: RNode, operations: DataFrameOperation[] | undefined, domain: DataFrameStateDomain): DataFrameStateDomain { + if(operations === undefined) { + return domain; + } else if(this.operations !== undefined) { + this.operations.set(node.info.id, operations); } - } else if(origins.includes('builtin:if-then-else') && call?.args.every(arg => arg !== EmptyArgument)) { - if(call.args.length === 3) { - const values = call.args.slice(1, 3).map(entry => resolveIdToDataFrameShape(entry.nodeId, dfg, domain)); + const maxColNames = this.config.ctx.config.abstractInterpretation.dataFrame.maxColNames; + let value = DataFrameDomain.top(maxColNames); + + for(const { operation, operand, type, options, ...args } of operations) { + const operandValue = operand !== undefined ? this.getValue(operand, domain) : value; + value = applyDataFrameSemantics(operation, operandValue ?? DataFrameDomain.top(maxColNames), args, options); + const constraintType = type ?? getConstraintType(operation); - if(values.length > 0 && values.every(isNotUndefined)) { - return AbstractDomain.joinAll(values); + if(operand !== undefined && constraintType === ConstraintType.OperandModification) { + domain.set(operand, value); + + for(const origin of this.getVariableOrigins(operand)) { + domain.set(origin, value); + } + } else if(constraintType === ConstraintType.ResultPostcondition) { + domain.set(node.info.id, value); } } + return domain; } } - -/** - * Gets all origins of a variable in the data flow graph that have already been visited. 
- * @param node - The node to get the origins for - * @param dfg - The data flow graph for resolving the origins - * @returns The origins nodes of the variable - */ -export function getVariableOrigins(node: NodeId, dfg: DataflowGraph): RNode[] { - // get each variable origin that has already been visited and whose assignment has already been processed - return getOriginInDfg(dfg, node) - ?.filter(origin => origin.type === OriginType.ReadVariableOrigin) - .map | undefined>(entry => dfg.idMap?.get(entry.id)) - .filter(isNotUndefined) - .filter(origin => origin.info.dataFrame?.domain !== undefined) - .filter(origin => !hasDataFrameInfoMarker(origin, DataFrameInfoMarker.Unassigned)) ?? []; -} diff --git a/src/benchmark/slicer.ts b/src/benchmark/slicer.ts index b488d6a04f0..e16fb0d9e9d 100644 --- a/src/benchmark/slicer.ts +++ b/src/benchmark/slicer.ts @@ -23,7 +23,7 @@ import type { SlicerStats, SlicerStatsDfShape } from './stats/stats'; -import type { NormalizedAst, ParentInformation } from '../r-bridge/lang-4.x/ast/model/processing/decorate'; +import type { NormalizedAst } from '../r-bridge/lang-4.x/ast/model/processing/decorate'; import type { SlicingCriteria } from '../slicing/criterion/parse'; import { createSlicePipeline, @@ -52,13 +52,8 @@ import { equidistantSampling } from '../util/collections/arrays'; import { type FlowrConfigOptions, getEngineConfig } from '../config'; import type { ControlFlowInformation } from '../control-flow/control-flow-graph'; import { extractCfg } from '../control-flow/extract-cfg'; -import type { RNode } from '../r-bridge/lang-4.x/ast/model/model'; -import { - type AbstractInterpretationInfo, - hasDataFrameExpressionInfo -} from '../abstract-interpretation/data-frame/absint-info'; import type { DataFrameDomain } from '../abstract-interpretation/data-frame/dataframe-domain'; -import { inferDataFrameShapes } from '../abstract-interpretation/data-frame/shape-inference'; +import { DataFrameShapeInferenceVisitor } from 
'../abstract-interpretation/data-frame/shape-inference'; import type { PosIntervalDomain } from '../abstract-interpretation/domains/positive-interval-domain'; import { Top } from '../abstract-interpretation/domains/lattice'; import { SetRangeDomain } from '../abstract-interpretation/domains/set-range-domain'; @@ -409,7 +404,7 @@ export class BenchmarkSlicer { guard(this.normalizedAst !== undefined, 'normalizedAst should be defined for data frame shape inference'); guard(this.dataflow !== undefined, 'dataflow should be defined for data frame shape inference'); guard(this.controlFlow !== undefined, 'controlFlow should be defined for data frame shape inference'); - guard(this.config !== undefined, 'config should be defined for data frame shape inference'); + guard(this.context !== undefined, 'context should be defined for data frame shape inference'); const ast = this.normalizedAst; const dfg = this.dataflow.graph; @@ -429,7 +424,9 @@ export class BenchmarkSlicer { perNodeStats: new Map() }; - const result = this.measureSimpleStep('infer data frame shapes', () => inferDataFrameShapes(cfinfo, dfg, ast, this.context as FlowrAnalyzerContext)); + const inference = new DataFrameShapeInferenceVisitor({ controlFlow: cfinfo, dfg, normalizedAst: ast, ctx: this.context }); + this.measureSimpleStep('infer data frame shapes', () => inference.start()); + const result = inference.getResult(); stats.numberOfResultConstraints = result.value.size; for(const value of result.value.values()) { @@ -442,25 +439,24 @@ export class BenchmarkSlicer { } } - visitAst(this.normalizedAst.ast.files.map(f => f.root), (node: RNode) => { - if(node.info.dataFrame === undefined) { - return; - } - stats.sizeOfInfo += safeSizeOf([node.info.dataFrame]); - - const expression = hasDataFrameExpressionInfo(node) ? 
node.info.dataFrame : undefined; - const value = node.info.dataFrame.domain?.get(node.info.id); + visitAst(this.normalizedAst.ast.files.map(file => file.root), node => { + const operations = inference.getOperations(node.info.id); + const value = inference.getValue(node.info.id); // Only store per-node information for nodes representing expressions or nodes with abstract values - if(expression === undefined && value === undefined) { + if(operations === undefined && value === undefined) { stats.numberOfEmptyNodes++; return; } + const state = inference.getState(node.info.id); + stats.sizeOfInfo += safeSizeOf([state]); + const nodeStats: PerNodeStatsDfShape = { - numberOfEntries: node.info.dataFrame?.domain?.value.size ?? 0 + numberOfEntries: state?.value.size ?? 0 }; - if(expression !== undefined) { - nodeStats.mappedOperations = expression.operations.map(op => op.operation); + + if(operations !== undefined) { + nodeStats.mappedOperations = operations.map(op => op.operation); stats.numberOfOperationNodes++; if(value !== undefined) { diff --git a/src/config.ts b/src/config.ts index 003fc07160e..5e7abe08fb7 100644 --- a/src/config.ts +++ b/src/config.ts @@ -161,6 +161,10 @@ export interface FlowrConfigOptions extends MergeableRecord { * Configuration options for abstract interpretation */ readonly abstractInterpretation: { + /** + * The threshold for the number of visitations of a node at which widening should be performed to ensure the termination of the fixpoint iteration + */ + readonly wideningThreshold: number; /** * The configuration of the shape inference for data frames */ @@ -168,11 +172,7 @@ export interface FlowrConfigOptions extends MergeableRecord { /** * The maximum number of columns names to infer for data frames before over-approximating the column names to top */ - readonly maxColNames: number; - /** - * The threshold for the number of visitations of a node at which widening should be performed to ensure the termination of the fixpoint iteration - */ - 
readonly wideningThreshold: number; + readonly maxColNames: number; /** * Configuration options for reading data frame shapes from loaded external data files, such as CSV files */ @@ -253,10 +253,10 @@ export const defaultConfigOptions: FlowrConfigOptions = { } }, abstractInterpretation: { - dataFrame: { - maxColNames: 50, - wideningThreshold: 4, - readLoadedData: { + wideningThreshold: 4, + dataFrame: { + maxColNames: 50, + readLoadedData: { readExternalFiles: true, maxReadLines: 1e6 } diff --git a/src/control-flow/basic-cfg-guided-visitor.ts b/src/control-flow/basic-cfg-guided-visitor.ts index 93559741cec..76943642933 100644 --- a/src/control-flow/basic-cfg-guided-visitor.ts +++ b/src/control-flow/basic-cfg-guided-visitor.ts @@ -48,20 +48,19 @@ export class BasicCfgGuidedVisitor< } protected startVisitor(start: readonly NodeId[]): void { - const g = this.config.controlFlow.graph; - const n = this.config.defaultVisitingOrder === 'forward' ? - (n: NodeId) => g.ingoingEdges(n) : - (n: NodeId) => g.outgoingEdges(n); + const graph = this.config.controlFlow.graph; + const getNext = this.config.defaultVisitingOrder === 'forward' ? + (node: NodeId) => graph.ingoingEdges(node)?.keys().toArray().toReversed() : + (node: NodeId) => graph.outgoingEdges(node)?.keys().toArray(); const stack = [...start]; while(stack.length > 0) { - const current = stack.shift() as NodeId; + const current = stack.pop() as NodeId; if(!this.visitNode(current)) { continue; } - const outgoing = n(current) ?? []; - for(const [to] of outgoing) { - stack.unshift(to); + for(const next of getNext(current) ?? 
[]) { + stack.push(next); } } } diff --git a/src/control-flow/semantic-cfg-guided-visitor.ts b/src/control-flow/semantic-cfg-guided-visitor.ts index 906b01b7a56..c696c074ae7 100644 --- a/src/control-flow/semantic-cfg-guided-visitor.ts +++ b/src/control-flow/semantic-cfg-guided-visitor.ts @@ -153,7 +153,7 @@ export class SemanticCfgGuidedVisitor< protected override visitFunctionCall(vertex: DataflowGraphVertexFunctionCall) { super.visitFunctionCall(vertex); if(vertex.origin === 'unnamed') { - this.onUnnamedCall({ vertex }); + this.onUnnamedCall({ call: vertex }); } else { this.onDispatchFunctionCallOrigins(vertex, vertex.origin); } @@ -380,7 +380,7 @@ export class SemanticCfgGuidedVisitor< * to rely on {@link SemanticCfgGuidedVisitor#getOrigins|`getOrigins`} to get more information. * @protected */ - protected onUnnamedCall(_data: { vertex: DataflowGraphVertexFunctionCall }) {} + protected onUnnamedCall(_data: { call: DataflowGraphVertexFunctionCall }) {} /** * This event triggers for every function call that is not handled by a specific overload, diff --git a/src/documentation/wiki-interface.ts b/src/documentation/wiki-interface.ts index 129cb1e3262..d852f675bed 100644 --- a/src/documentation/wiki-interface.ts +++ b/src/documentation/wiki-interface.ts @@ -243,10 +243,10 @@ ${codeBlock('json', JSON.stringify( } }, abstractInterpretation: { - dataFrame: { - maxColNames: 20, - wideningThreshold: 4, - readLoadedData: { + wideningThreshold: 4, + dataFrame: { + maxColNames: 20, + readLoadedData: { readExternalFiles: true, maxReadLines: 1_000_000 } diff --git a/src/linter/rules/dataframe-access-validation.ts b/src/linter/rules/dataframe-access-validation.ts index 145c9c30c64..73fd45df2d9 100644 --- a/src/linter/rules/dataframe-access-validation.ts +++ b/src/linter/rules/dataframe-access-validation.ts @@ -1,7 +1,6 @@ -import { type AbstractInterpretationInfo, type DataFrameOperationType, hasDataFrameExpressionInfo } from 
'../../abstract-interpretation/data-frame/absint-info'; import type { DataFrameDomain } from '../../abstract-interpretation/data-frame/dataframe-domain'; -import { inferDataFrameShapes, resolveIdToDataFrameShape } from '../../abstract-interpretation/data-frame/shape-inference'; -import { SetComparator , NumericalComparator } from '../../abstract-interpretation/domains/satisfiable-domain'; +import { DataFrameShapeInferenceVisitor, type DataFrameOperationType } from '../../abstract-interpretation/data-frame/shape-inference'; +import { NumericalComparator, SetComparator } from '../../abstract-interpretation/domains/satisfiable-domain'; import { amendConfig } from '../../config'; import { extractCfg } from '../../control-flow/extract-cfg'; import type { ParentInformation } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; @@ -14,7 +13,7 @@ import { Ternary } from '../../util/logic'; import { formatRange } from '../../util/mermaid/dfg'; import { type MergeableRecord } from '../../util/objects'; import { rangeFrom, type SourceRange } from '../../util/range'; -import { type LintingResult, type LintingRule , LintingPrettyPrintContext, LintingResultCertainty, LintingRuleCertainty } from '../linter-format'; +import { LintingPrettyPrintContext, LintingResultCertainty, LintingRuleCertainty, type LintingResult, type LintingRule } from '../linter-format'; import { LintingRuleTag } from '../linter-tags'; interface DataFrameAccessOperation { @@ -71,9 +70,10 @@ export const DATA_FRAME_ACCESS_VALIDATION = { }) }; const cfg = extractCfg(data.normalize, ctx, data.dataflow.graph); - inferDataFrameShapes(cfg, data.dataflow.graph, data.normalize, ctx); + const inference = new DataFrameShapeInferenceVisitor({ controlFlow: cfg, dfg: data.dataflow.graph, normalizedAst: data.normalize, ctx }); + inference.start(); - const accessOperations = getAccessOperations(elements); + const accessOperations = getAccessOperations(elements, inference); const accesses: 
DataFrameAccessOperation[] = []; for(const [nodeId, operations] of accessOperations) { @@ -81,7 +81,7 @@ export const DATA_FRAME_ACCESS_VALIDATION = { for(const operation of operations) { access.operand ??= operation.operand; - access.operandShape ??= resolveIdToDataFrameShape(operation.operand, data.dataflow.graph); + access.operandShape ??= inference.getValue(operation.operand); if(operation.operation === 'accessCols' && operation.columns !== undefined) { access.accessedCols ??= []; @@ -142,16 +142,15 @@ export const DATA_FRAME_ACCESS_VALIDATION = { } as const satisfies LintingRule; function getAccessOperations( - elements: FlowrSearchElements + elements: FlowrSearchElements, + inference: DataFrameShapeInferenceVisitor ): Map[]> { return new Map(elements.getElements() - .map(element => element.node) - .filter(hasDataFrameExpressionInfo) - .map<[NodeId, DataFrameOperationType<'accessCols' | 'accessRows'>[]]>(node => - [node.info.id, node.info.dataFrame.operations - .filter(({ operation }) => operation === 'accessCols' || operation === 'accessRows') + .map<[NodeId, DataFrameOperationType<'accessCols' | 'accessRows'>[]]>(element => + [element.node.info.id, inference.getOperations(element.node.info.id) + ?.filter(({ operation }) => operation === 'accessCols' || operation === 'accessRows') .map(({ operation, operand, type: _type, options: _options, ...args }) => - ({ operation, operand, ...args } as DataFrameOperationType<'accessCols' | 'accessRows'>)) + ({ operation, operand, ...args } as DataFrameOperationType<'accessCols' | 'accessRows'>)) ?? 
[] ]) .filter(([, operations]) => operations.length > 0) ); diff --git a/src/queries/catalog/df-shape-query/df-shape-query-executor.ts b/src/queries/catalog/df-shape-query/df-shape-query-executor.ts index 28dd05e108e..235c61c98ab 100644 --- a/src/queries/catalog/df-shape-query/df-shape-query-executor.ts +++ b/src/queries/catalog/df-shape-query/df-shape-query-executor.ts @@ -1,5 +1,5 @@ import type { DataFrameDomain } from '../../../abstract-interpretation/data-frame/dataframe-domain'; -import { inferDataFrameShapes, resolveIdToDataFrameShape } from '../../../abstract-interpretation/data-frame/shape-inference'; +import { DataFrameShapeInferenceVisitor } from '../../../abstract-interpretation/data-frame/shape-inference'; import { type SingleSlicingCriterion, slicingCriterionToId } from '../../../slicing/criterion/parse'; import { log } from '../../../util/log'; import type { BasicQueryData } from '../../base-query-format'; @@ -19,7 +19,9 @@ export async function executeDfShapeQuery({ analyzer }: BasicQueryData, queries: const cfg = await analyzer.controlflow(); const start = Date.now(); - const domains = inferDataFrameShapes(cfg, dfg, ast, analyzer.inspectContext()); + const inference = new DataFrameShapeInferenceVisitor({ controlFlow: cfg, dfg, normalizedAst: ast, ctx: analyzer.inspectContext() }); + inference.start(); + const domains = inference.getResult(); if(queries.length === 1 && queries[0].criterion === undefined) { return { @@ -41,7 +43,7 @@ export async function executeDfShapeQuery({ analyzer }: BasicQueryData, queries: } const nodeId = slicingCriterionToId(query.criterion, ast.idMap); const node = ast.idMap.get(nodeId); - const value = resolveIdToDataFrameShape(node?.info.id, dfg); + const value = inference.getValue(node?.info.id); result.set(query.criterion, value); } diff --git a/test/functionality/abstract-interpretation/data-frame/data-frame.ts b/test/functionality/abstract-interpretation/data-frame/data-frame.ts index 97973e2a85a..7e0fdd0dd0d 100644 
--- a/test/functionality/abstract-interpretation/data-frame/data-frame.ts +++ b/test/functionality/abstract-interpretation/data-frame/data-frame.ts @@ -1,8 +1,7 @@ import { assert, beforeAll, test } from 'vitest'; -import { type AbstractInterpretationInfo, type DataFrameOperation, hasDataFrameExpressionInfo } from '../../../../src/abstract-interpretation/data-frame/absint-info'; import type { AbstractDataFrameShape, DataFrameDomain, DataFrameShapeProperty } from '../../../../src/abstract-interpretation/data-frame/dataframe-domain'; import type { DataFrameOperationArgs, DataFrameOperationName } from '../../../../src/abstract-interpretation/data-frame/semantics'; -import { inferDataFrameShapes, resolveIdToDataFrameShape } from '../../../../src/abstract-interpretation/data-frame/shape-inference'; +import { type DataFrameOperation, DataFrameShapeInferenceVisitor } from '../../../../src/abstract-interpretation/data-frame/shape-inference'; import type { AnyAbstractDomain } from '../../../../src/abstract-interpretation/domains/abstract-domain'; import { Bottom, Top } from '../../../../src/abstract-interpretation/domains/lattice'; import type { ArrayRangeValue } from '../../../../src/abstract-interpretation/domains/set-range-domain'; @@ -375,8 +374,9 @@ function getInferredDomainForCriterion( throw new Error(`slicing criterion ${criterion} does not refer to an AST node`); } const cfg = extractCfg(result.normalize, ctx, result.dataflow.graph); - inferDataFrameShapes(cfg, result.dataflow.graph, result.normalize, ctx); - const value = resolveIdToDataFrameShape(node, result.dataflow.graph); + const inference = new DataFrameShapeInferenceVisitor({ controlFlow: cfg, dfg: result.dataflow.graph, normalizedAst: result.normalize, ctx }); + inference.start(); + const value = inference.getValue(node); return [value, node]; } @@ -385,21 +385,19 @@ function getInferredOperationsForCriterion( result: PipelineOutput, criterion: SingleSlicingCriterion, ctx: ReadOnlyFlowrAnalyzerContext -): 
DataFrameOperation[] { +): readonly DataFrameOperation[] { const idMap = result.dataflow.graph.idMap ?? result.normalize.idMap; const nodeId = slicingCriterionToId(criterion, idMap); - let node: RNode | undefined = idMap.get(nodeId); + let node = idMap.get(nodeId); if(node?.info.role === RoleInParent.FunctionCallName) { node = node.info.parent !== undefined ? idMap.get(node.info.parent) : undefined; } - if(node === undefined) { - throw new Error(`slicing criterion ${criterion} does not refer to an AST node`); - } const cfg = extractCfg(result.normalize, ctx, result.dataflow.graph); - inferDataFrameShapes(cfg, result.dataflow.graph, result.normalize, ctx); + const inference = new DataFrameShapeInferenceVisitor({ controlFlow: cfg, dfg: result.dataflow.graph, normalizedAst: result.normalize, ctx }); + inference.start(); - return hasDataFrameExpressionInfo(node) ? node.info.dataFrame.operations : []; + return inference.getOperations(node?.info.id) ?? []; } function getRealDomainFromOutput( diff --git a/test/functionality/abstract-interpretation/data-frame/inference.test.ts b/test/functionality/abstract-interpretation/data-frame/inference.test.ts index edc1118cd11..8fc42d78e72 100644 --- a/test/functionality/abstract-interpretation/data-frame/inference.test.ts +++ b/test/functionality/abstract-interpretation/data-frame/inference.test.ts @@ -1,10 +1,10 @@ import { beforeAll, describe } from 'vitest'; import { Top } from '../../../../src/abstract-interpretation/domains/lattice'; import { PosIntervalTop } from '../../../../src/abstract-interpretation/domains/positive-interval-domain'; +import { FlowrInlineTextFile } from '../../../../src/project/context/flowr-file'; import { MIN_VERSION_LAMBDA, MIN_VERSION_PIPE } from '../../../../src/r-bridge/lang-4.x/ast/model/versions'; import { withShell } from '../../_helper/shell'; import { assertDataFrameDomain, assertDataFrameOperation, DataFrameShapeOverapproximation, testDataFrameDomain, testDataFrameDomainAgainstReal, 
testDataFrameDomainWithSource } from './data-frame'; -import { FlowrInlineTextFile } from '../../../../src/project/context/flowr-file'; /** The minimum version required for calling `head` and `tail` with a vector argument, e.g. `head(df, c(1, 2))` */ export const MIN_VERSION_HEAD_TAIL_VECTOR = '4.0.0'; @@ -186,6 +186,48 @@ print(df) shell, ` df <- data.frame(id = 1:5) +while (nrow(df) < 10) { + df <- rbind(df, 10) +} +print(df) + `.trim(), + [['5@df', { colnames: [['id'], []], cols: [1, 1], rows: [5, Infinity] }]] + ); + + assertDataFrameDomain( + shell, + ` +df <- data.frame(id = 1:5) +repeat { + df <- rbind(df, 10) + if (unknown) { + break + } +} +print(df) + `.trim(), + [['8@df', { colnames: [['id'], []], cols: [1, 1], rows: [5, Infinity] }]] + ); + + assertDataFrameDomain( + shell, + ` +df <- data.frame(id = 1:5) +repeat { + if (unknown) { + break + } + df <- rbind(df, 10) +} +print(df) + `.trim(), + [['8@df', { colnames: [['id'], []], cols: [1, 1], rows: [5, Infinity] }]] + ); + + testDataFrameDomain( + shell, + ` +df <- data.frame(id = 1:5) while (nrow(df) < 10) { if (ncol(df) == 1) { df <- cbind(df, name = "A") diff --git a/test/functionality/control-flow/cfg-visit.test.ts b/test/functionality/control-flow/cfg-visit.test.ts index 87eb09b9548..ddc9079c8eb 100644 --- a/test/functionality/control-flow/cfg-visit.test.ts +++ b/test/functionality/control-flow/cfg-visit.test.ts @@ -53,7 +53,7 @@ describe('Control Flow Graph', withTreeSitter(parser => { assertOrderBasic('simple assignment (basic blocks)', 'a <- 1', ['bb-3-exit', 3, 2, 0, 1, '2-exit', '3-exit'], ['bb-3-exit', '3-exit', '2-exit', 1, 0, 2, 3], true); assertOrderBasic('sequence', 'a;b', [2, 0, 1, '2-exit']); assertOrderBasic('while-loop', 'while(TRUE) a + b', - [6, 5, 0, '5-exit', '6-exit', 4, 3, 1, 2, '3-exit', '4-exit'], + [6, 5, 0, 4, 3, 1, 2, '3-exit', '4-exit', '5-exit', '6-exit'], ['6-exit', '5-exit', 0, 5, '4-exit', '3-exit', 2, 1, 3, 4, 6] );